"""Convert page images into Anki cards via the OpenAI chat-completions API.

Each image is downscaled, cropped to its left part, JPEG/base64 encoded and
sent together with ``prompt.SYSTEM_PROMPT``.  Optionally a local Tesseract OCR
pass is run on the full-size image and its text is sent alongside the image.

The API key is read from the file ``apikey.secret`` at import time.
"""
import base64
from io import BytesIO
from typing import Any, Optional

import pytesseract
import requests
from PIL import Image

import prompt

# OpenAI API key, loaded once at import time.  ``api_key`` is ``None`` when the
# key file is missing so that callers get a clear error instead of a NameError.
try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    api_key = None
    print('Couldn\'t read API key from file \'apikey.secret\'. Does it exist?')

# Image modes Pillow's JPEG writer accepts; anything else (RGBA, P, LA, ...)
# has to be converted before saving.
_JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

# Prices (USD per 1000 tokens) used by ``test`` for its cost estimate.
_PROMPT_USD_PER_1K_TOKENS = 0.01
_COMPLETION_USD_PER_1K_TOKENS = 0.03


def encode_image(image: Image.Image) -> str:
    """Return *image* as a base64-encoded JPEG string (quality 95).

    Images in a mode the JPEG writer cannot store (e.g. RGBA or palette PNGs)
    are converted to RGB first; any alpha channel is dropped by that
    conversion.
    """
    if image.mode not in _JPEG_WRITABLE_MODES:
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


def resize_image(input_image_path, max_height, max_width) -> Image.Image:
    """Open the image at *input_image_path* and shrink it to fit the bounds.

    The aspect ratio is preserved.  Images that already fit within
    ``max_width`` x ``max_height`` are returned as opened (never upscaled).
    """
    original_image = Image.open(input_image_path)
    width, height = original_image.size

    # Scale factors needed to satisfy each bound; < 1 means "too large".
    wr = max_width / width
    hr = max_height / height
    if wr < 1 or hr < 1:
        ratio = min(wr, hr)
        # Clamp to 1 px: truncation may yield 0 for extreme aspect ratios,
        # which ``resize`` rejects.
        new_size = (max(1, int(ratio * width)), max(1, int(ratio * height)))
        return original_image.resize(new_size)
    return original_image


def crop_image_to_left_side(image: Image.Image, crop_width) -> Image.Image:
    """Return the leftmost *crop_width* pixels of *image* at full height.

    If the image is narrower than *crop_width* the full width is kept.
    """
    return image.crop((0, 0, min(image.size[0], crop_width), image.size[1]))


def ocr(image: Image.Image, lang: Optional[str] = 'eng') -> str:
    """Run Tesseract on *image* and return the recognised text.

    *lang* is passed straight to ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, lang=lang)


def _build_image_messages(images: list[str], ocr_results: list[str]) -> list[dict]:
    """Build the per-page message parts: optional OCR text, then the image.

    *ocr_results* is either empty (no OCR was done) or parallel to *images*.
    """
    image_msgs = []
    for i, base64_image in enumerate(images):
        if ocr_results:
            # The OCR text precedes the image it describes.
            image_msgs.append({
                "type": "text",
                "text": "Here are OCR results for the following page. These might be flawed. Use them to improve your "
                        "performance:\n " + ocr_results[i]
            })
        image_msgs.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}",
                "detail": "high"
            }
        })
    return image_msgs


def image_to_anki(image_paths: str | list[str], do_ocr: bool = False,
                  lang: Optional[str] = None) -> tuple[str | None, Any]:
    """Send one or more page images to the API and return the generated cards.

    Args:
        image_paths: A single path or a list of paths to image files.
        do_ocr: When true, run local OCR on each full-size image and include
            the text in the request.
        lang: Tesseract language string used for OCR (only used if *do_ocr*).

    Returns:
        ``(completion, response_json)`` where *completion* is the text of the
        first choice, or ``None`` if the response does not contain one (for
        example an API error), and *response_json* is the decoded response.

    Raises:
        RuntimeError: If no API key could be loaded from ``apikey.secret``.
    """
    if api_key is None:
        raise RuntimeError(
            "No OpenAI API key available: file 'apikey.secret' could not be read.")

    if isinstance(image_paths, str):
        image_paths = [image_paths]

    images = []
    ocr_results = []
    for image_path in image_paths:
        # seems to work good for these dict pages
        resized_image = resize_image(image_path, 1536, 1024)
        cropped_image = crop_image_to_left_side(resized_image, 512)
        images.append(encode_image(cropped_image))

        if do_ocr:
            print("doing local ocr...", end='')
            # OCR runs on the full-resolution original, not the cropped image.
            with Image.open(image_path) as original_image:
                ocr_text = ocr(original_image, lang)
            print(f" done. \nlocal ocr resulted in {len(ocr_text)} characters.")
            ocr_results.append(ocr_text)

    image_msgs = _build_image_messages(images, ocr_results)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                ] + image_msgs
            }
        ],
        # in general, around 350 tokens per page, so around double to be safe
        "max_tokens": 600 * len(images),
        "temperature": 0.2,
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response_json = response.json()

    # Extract the completion; error responses have no usable 'choices' entry.
    try:
        completion = response_json['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        completion = None

    return completion, response_json


def test():
    """Run a single sample page through ``image_to_anki`` and print the cost."""
    image_path = [
        # './.img/dict.pdf_7.png',
        # './.img/dict.pdf_8.png',
        './.img/dict.pdf_103.png',
    ]

    text, meta = image_to_anki(image_path, do_ocr=False, lang='eng+chi_sim')
    print(text)

    usage = meta.get('usage') if isinstance(meta, dict) else None
    if usage is None:
        # Typically an API error response; there is nothing to bill.
        print(f'no usage information in response: {meta}')
        return

    print(
        f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')

    prompt_cost = usage["prompt_tokens"] * _PROMPT_USD_PER_1K_TOKENS / 1000
    completion_cost = usage["completion_tokens"] * _COMPLETION_USD_PER_1K_TOKENS / 1000
    print(
        f'approx. cost: 0.0075$ per picture, {prompt_cost}$ for prompt tokens and {completion_cost}$ for completion tokens')

    cost_this = prompt_cost + completion_cost  # + 0.0075
    print(f'this page: {cost_this}$')


if __name__ == '__main__':
    test()