2024-02-01 11:56:34 +00:00
import base64
from typing import Any
import requests
from PIL import Image
from io import BytesIO
# OpenAI API Key
import prompt
try :
with open ( ' apikey.secret ' ) as f :
api_key = f . read ( ) . strip ( )
except FileNotFoundError :
print ( ' Couldn \' t read API key from file \' apikey.secret \' w. Does it exist? ' )
# Function to encode image
def encode_image ( image ) :
buffered = BytesIO ( )
image . save ( buffered , format = " JPEG " , quality = 95 )
img_str = base64 . b64encode ( buffered . getvalue ( ) ) . decode ( ' utf-8 ' )
return img_str
# Function to resize the image
def resize_image ( input_image_path , max_height , max_width ) :
original_image = Image . open ( input_image_path )
width , height = original_image . size
# resize amount required
wr = max_width / width
hr = max_height / height
if wr < 1 or hr < 1 :
ratio = min ( wr , hr )
image = original_image . resize ( ( int ( ratio * width ) , int ( ratio * height ) ) )
else :
image = original_image
return image
def crop_image_to_left_side ( image : Image , crop_width ) - > Image :
return image . crop ( ( 0 , 0 , min ( image . size [ 0 ] , crop_width ) , image . size [ 1 ] ) )
# Resize the image and get base64 string
# resized_image = resize_image(image_path, 1024, 512)
def image_to_anki ( image_paths : str | list [ str ] ) - > tuple [ str | None , Any ] :
images = [ ]
if isinstance ( image_paths , str ) :
image_paths = [ image_paths ]
for image_path in image_paths :
resized_image = resize_image ( image_path , 1536 , 1024 ) # seems to work good for these dict pages
cropped_image = crop_image_to_left_side ( resized_image , 512 )
# cropped_image.show()
# print(cropped_image.size)
# exit(1)
base64_image = encode_image ( cropped_image )
images . append ( base64_image )
# print(resized_image.size)
# exit(1)
headers = {
" Content-Type " : " application/json " ,
" Authorization " : f " Bearer { api_key } "
}
payload = {
" model " : " gpt-4-vision-preview " ,
" messages " : [
{
" role " : " system " ,
" content " : [
{
" type " : " text " ,
" text " : prompt . SYSTEM_PROMPT ,
}
]
} ,
{
" role " : " user " ,
" content " : [
{
" type " : " text " ,
" text " : " Transform this image into Anki cards. "
} ,
# {
# "type": "image_url",
# "image_url": {
# "url": f"data:image/jpeg;base64,{base64_image}"
# }
# }
] + [ {
" type " : " image_url " ,
" image_url " : {
2024-02-02 12:47:38 +00:00
" url " : f " data:image/jpeg;base64, { base64_image } " ,
" detail " : " high "
2024-02-01 11:56:34 +00:00
}
}
for base64_image in images ]
}
] ,
" max_tokens " : 600 * len ( images ) , # in general, around 350 tokens per page, so around double to be safe
" temperature " : 0.0 ,
}
response = requests . post ( " https://api.openai.com/v1/chat/completions " , headers = headers , json = payload )
response_json = response . json ( )
# print(response_json)
# Extracting the completion
try :
completion = response_json [ ' choices ' ] [ 0 ] [ ' message ' ] [ ' content ' ]
except KeyError :
completion = None
# print(completion)
return completion , response_json
def test ( ) :
# Path to your image
# image_paths = "IMG_5334.PNG"
# image_path = 'tmp.jpg'
image_path = [
' ./.img/dict.pdf_7.png ' ,
' ./.img/dict.pdf_8.png ' ,
]
text , meta = image_to_anki ( image_path )
print ( text )
usage = meta [ ' usage ' ]
print (
f ' usage for page: \n { usage [ " prompt_tokens " ] } prompt tokens and { usage [ " completion_tokens " ] } completion tokens ' )
print (
f ' approx. cost: 0.0075$ per picture, { usage [ " prompt_tokens " ] * 0.01 / 1000 } $ for prompt tokens and { usage [ " completion_tokens " ] * 0.01 / 1000 } $ for completion tokens ' )
cost_this = usage [ " prompt_tokens " ] * 0.01 / 1000 + usage [ " completion_tokens " ] * 0.01 / 1000 + 0.0075
print ( f ' this page: { cost_this } $ ' )
if __name__ == ' __main__ ' :
test ( )