"""Convert page images (e.g. scanned dictionary pages) into Anki cards.

Each image is downscaled, cropped to its left side, JPEG/base64-encoded and
sent to the OpenAI chat-completions endpoint together with the system prompt
from the local ``prompt`` module.
"""
import base64
from io import BytesIO
from typing import Any

import requests
from PIL import Image

import prompt

# OpenAI API key, read once at import time from 'apikey.secret'.
# It is bound to None when the file is missing so that a later call fails
# with a clear error instead of a NameError.
api_key: str | None
try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    api_key = None
    print("Couldn't read API key from file 'apikey.secret'. Does it exist?")


def encode_image(image):
    """Return *image* as a base64-encoded (UTF-8 str) JPEG at quality 95.

    Images with an alpha channel or a palette (modes RGBA, LA, P) are
    converted to RGB first, because Pillow refuses to write them as JPEG.
    """
    if image.mode in ("RGBA", "LA", "P"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


def resize_image(input_image_path, max_height, max_width):
    """Open an image and shrink it to fit within ``max_width`` x ``max_height``.

    The aspect ratio is preserved and images that already fit are never
    enlarged. The returned image is fully loaded, so it stays usable after
    the underlying file has been closed.
    """
    with Image.open(input_image_path) as original_image:
        width, height = original_image.size
        # Scale factor required along each axis; the smaller one wins.
        ratio = min(max_width / width, max_height / height)
        if ratio >= 1:
            # copy() forces the lazily-read pixel data to load before close.
            return original_image.copy()
        # Clamp to 1px so extreme aspect ratios cannot truncate to size 0.
        new_size = (max(1, int(ratio * width)), max(1, int(ratio * height)))
        return original_image.resize(new_size)


def crop_image_to_left_side(image: Image.Image, crop_width) -> Image.Image:
    """Return the leftmost ``crop_width`` pixels of *image* at full height.

    If the image is narrower than ``crop_width`` its full width is kept.
    """
    return image.crop((0, 0, min(image.size[0], crop_width), image.size[1]))


def _build_payload(images: list[str]) -> dict[str, Any]:
    """Build the chat-completions request body for base64 JPEG *images*."""
    image_parts = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
            }
        }
        for base64_image in images
    ]
    return {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                ] + image_parts
            }
        ],
        # in general, around 350 tokens per page, so around double to be safe
        "max_tokens": 600 * len(images),
        "temperature": 0.0,
    }


def image_to_anki(image_paths: str | list[str], *,
                  timeout: float = 300.0) -> tuple[str | None, Any]:
    """Send one or more page images to the model and return its Anki cards.

    Args:
        image_paths: A single image path or a list of image paths.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(completion, response_json)``. ``completion`` is the text of the
        first choice, or ``None`` when the response does not contain one
        (for example an API error); ``response_json`` is the decoded body.

    Raises:
        RuntimeError: If no API key could be loaded at import time.
    """
    if api_key is None:
        raise RuntimeError(
            "OpenAI API key is not available; expected it in 'apikey.secret'.")

    if isinstance(image_paths, str):
        image_paths = [image_paths]

    images = []
    for image_path in image_paths:
        # seems to work good for these dict pages
        resized_image = resize_image(image_path, 1536, 1024)
        cropped_image = crop_image_to_left_side(resized_image, 512)
        images.append(encode_image(cropped_image))

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    response = requests.post("https://api.openai.com/v1/chat/completions",
                             headers=headers, json=_build_payload(images),
                             timeout=timeout)
    response_json = response.json()

    # Extracting the completion; error responses carry no usable 'choices'.
    try:
        completion = response_json['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        completion = None
    return completion, response_json


def test():
    """Run a manual smoke test on two sample pages and print usage and cost."""
    # Path to your image
    image_path = [
        './.img/dict.pdf_7.png',
        './.img/dict.pdf_8.png',
    ]
    text, meta = image_to_anki(image_path)
    print(text)

    usage = meta.get('usage') if isinstance(meta, dict) else None
    if usage is None:
        # Error responses have no usage block; show them instead of crashing.
        print(f'request failed, response was:\n{meta}')
        return

    prompt_cost = usage["prompt_tokens"] * 0.01 / 1000
    completion_cost = usage["completion_tokens"] * 0.01 / 1000
    print(
        f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
    print(
        f'approx. cost: 0.0075$ per picture, {prompt_cost}$ for prompt tokens and {completion_cost}$ for completion tokens')
    cost_this = prompt_cost + completion_cost + 0.0075
    print(f'this page: {cost_this}$')


if __name__ == '__main__':
    test()