2024-02-01 11:56:34 +00:00
import base64
2024-02-05 08:47:49 +00:00
from typing import Any , Optional
2024-02-01 11:56:34 +00:00
2024-02-05 08:47:49 +00:00
import pytesseract
2024-02-01 11:56:34 +00:00
import requests
from PIL import Image
from io import BytesIO
# OpenAI API Key
import prompt
try :
with open ( ' apikey.secret ' ) as f :
api_key = f . read ( ) . strip ( )
except FileNotFoundError :
print ( ' Couldn \' t read API key from file \' apikey.secret \' w. Does it exist? ' )
# Function to encode image
def encode_image ( image ) :
buffered = BytesIO ( )
image . save ( buffered , format = " JPEG " , quality = 95 )
img_str = base64 . b64encode ( buffered . getvalue ( ) ) . decode ( ' utf-8 ' )
return img_str
# Function to resize the image
def resize_image ( input_image_path , max_height , max_width ) :
original_image = Image . open ( input_image_path )
width , height = original_image . size
# resize amount required
wr = max_width / width
hr = max_height / height
if wr < 1 or hr < 1 :
ratio = min ( wr , hr )
image = original_image . resize ( ( int ( ratio * width ) , int ( ratio * height ) ) )
else :
image = original_image
return image
def crop_image_to_left_side ( image : Image , crop_width ) - > Image :
return image . crop ( ( 0 , 0 , min ( image . size [ 0 ] , crop_width ) , image . size [ 1 ] ) )
# Resize the image and get base64 string
# resized_image = resize_image(image_path, 1024, 512)
2024-02-05 08:47:49 +00:00
# Function to perform OCR
def ocr ( image : Image , lang : Optional [ str ] = ' eng ' ) - > str :
text = pytesseract . image_to_string ( image , lang = lang )
return text
def image_to_anki ( image_paths : str | list [ str ] , do_ocr : bool = False , lang : Optional [ str ] = None ) - > tuple [
str | None , Any ] :
2024-02-01 11:56:34 +00:00
images = [ ]
2024-02-05 08:47:49 +00:00
ocr_results = [ ]
2024-02-01 11:56:34 +00:00
if isinstance ( image_paths , str ) :
image_paths = [ image_paths ]
for image_path in image_paths :
resized_image = resize_image ( image_path , 1536 , 1024 ) # seems to work good for these dict pages
cropped_image = crop_image_to_left_side ( resized_image , 512 )
# cropped_image.show()
# print(cropped_image.size)
# exit(1)
base64_image = encode_image ( cropped_image )
images . append ( base64_image )
2024-02-05 08:47:49 +00:00
if do_ocr :
original_image = Image . open ( image_path )
print ( " doing local ocr... " , end = ' ' )
ocr_text = ocr ( original_image , lang )
print ( f " done. local ocr resulted in { len ( ocr_text ) } characters. " )
# print(ocr_text) # or save it somewhere, or add it to your payload for further processing
ocr_results . append ( ocr_text )
2024-02-01 11:56:34 +00:00
# print(resized_image.size)
# exit(1)
2024-02-05 08:47:49 +00:00
# generate image payload
image_msgs = [ ]
for i , base64_image in enumerate ( images ) :
image_payload = {
" type " : " image_url " ,
" image_url " : {
" url " : f " data:image/jpeg;base64, { base64_image } " ,
" detail " : " high "
}
}
if do_ocr :
ocr_payload = {
" type " : " text " ,
" text " : " Here are OCR results for the following page. These might be flawed. Use them to improve your "
" performance: \n " +
ocr_results [ i ]
}
image_msgs . append ( ocr_payload )
image_msgs . append ( image_payload )
2024-02-01 11:56:34 +00:00
headers = {
" Content-Type " : " application/json " ,
" Authorization " : f " Bearer { api_key } "
}
payload = {
" model " : " gpt-4-vision-preview " ,
" messages " : [
{
" role " : " system " ,
" content " : [
{
" type " : " text " ,
" text " : prompt . SYSTEM_PROMPT ,
}
]
} ,
{
" role " : " user " ,
" content " : [
2024-02-05 08:47:49 +00:00
{
" type " : " text " ,
" text " : " Transform this image into Anki cards. "
} ,
# {
# "type": "image_url",
# "image_url": {
# "url": f"data:image/jpeg;base64,{base64_image}"
# }
# }
] + image_msgs
2024-02-01 11:56:34 +00:00
}
] ,
" max_tokens " : 600 * len ( images ) , # in general, around 350 tokens per page, so around double to be safe
2024-02-05 08:47:49 +00:00
" temperature " : 0.2 ,
2024-02-01 11:56:34 +00:00
}
response = requests . post ( " https://api.openai.com/v1/chat/completions " , headers = headers , json = payload )
response_json = response . json ( )
# print(response_json)
# Extracting the completion
try :
completion = response_json [ ' choices ' ] [ 0 ] [ ' message ' ] [ ' content ' ]
except KeyError :
completion = None
# print(completion)
return completion , response_json
def test ( ) :
# Path to your image
# image_paths = "IMG_5334.PNG"
# image_path = 'tmp.jpg'
image_path = [
2024-02-05 08:47:49 +00:00
# './.img/dict.pdf_7.png',
# './.img/dict.pdf_8.png',
' ./.img/dict.pdf_103.png ' ,
2024-02-01 11:56:34 +00:00
]
2024-02-05 08:47:49 +00:00
text , meta = image_to_anki ( image_path , do_ocr = False , lang = ' eng+chi_sim ' )
2024-02-01 11:56:34 +00:00
print ( text )
usage = meta [ ' usage ' ]
print (
f ' usage for page: \n { usage [ " prompt_tokens " ] } prompt tokens and { usage [ " completion_tokens " ] } completion tokens ' )
print (
f ' approx. cost: 0.0075$ per picture, { usage [ " prompt_tokens " ] * 0.01 / 1000 } $ for prompt tokens and { usage [ " completion_tokens " ] * 0.01 / 1000 } $ for completion tokens ' )
2024-02-05 08:47:49 +00:00
cost_this = usage [ " prompt_tokens " ] * 0.01 / 1000 + usage [ " completion_tokens " ] * 0.03 / 1000 # + 0.0075
2024-02-01 11:56:34 +00:00
print ( f ' this page: { cost_this } $ ' )
if __name__ == ' __main__ ' :
test ( )