Yandrik
d9eb6f1c64
Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments
192 lines
5.5 KiB
Python
192 lines
5.5 KiB
Python
import base64
|
|
from typing import Any, Optional
|
|
|
|
import pytesseract
|
|
import requests
|
|
from PIL import Image
|
|
from io import BytesIO
|
|
|
|
# OpenAI API Key
|
|
|
|
import prompt
|
|
|
|
# Load the OpenAI API key from a local secrets file; it is used as the
# Bearer token for the Chat Completions request in image_to_anki().
try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    # Fix: removed a stray "w" from the message and define api_key so a
    # later reference fails with an obvious value instead of a NameError.
    print('Couldn\'t read API key from file \'apikey.secret\'. Does it exist?')
    api_key = None
|
|
|
|
|
|
# Function to encode image
|
|
def encode_image(image):
    """Serialize a PIL image to a base64-encoded JPEG string.

    The image is written as JPEG (quality 95) into an in-memory buffer and
    the resulting bytes are base64-encoded to UTF-8 text, ready to embed in
    a ``data:image/jpeg;base64,...`` URL.
    """
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG", quality=95)
    return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
|
|
|
|
|
|
# Function to resize the image
|
|
def resize_image(input_image_path, max_height, max_width):
    """Open the image at *input_image_path* and shrink it to fit inside
    ``max_width`` x ``max_height`` while preserving aspect ratio.

    Images that already fit are returned unscaled; the image is never
    enlarged.
    """
    img = Image.open(input_image_path)
    width, height = img.size

    # Scale factor needed to fit each dimension inside its bound; the
    # smaller of the two keeps both dimensions within limits.
    scale = min(max_width / width, max_height / height)

    # Only downscale — a scale >= 1 means the image already fits.
    if scale < 1:
        return img.resize((int(scale * width), int(scale * height)))
    return img
|
|
|
|
|
|
def crop_image_to_left_side(image: Image, crop_width) -> Image:
    """Return the left-hand strip of *image*, at most *crop_width* px wide.

    The full height is kept; if the image is narrower than *crop_width*,
    the whole image is cropped (i.e. returned unchanged in content).
    """
    strip_width = min(image.size[0], crop_width)
    return image.crop((0, 0, strip_width, image.size[1]))
|
|
|
|
|
|
# Resize the image and get base64 string
|
|
# resized_image = resize_image(image_path, 1024, 512)
|
|
|
|
|
|
# Function to perform OCR
|
|
def ocr(image: Image, lang: Optional[str] = 'eng') -> str:
    """Run Tesseract OCR over *image* and return the recognized text.

    *lang* is a Tesseract language spec such as ``'eng'`` or
    ``'eng+chi_sim'``.
    """
    return pytesseract.image_to_string(image, lang=lang)
|
|
|
|
|
|
def image_to_anki(image_paths: str | list[str], do_ocr: bool = False, lang: Optional[str] = None) -> tuple[
    str | None, Any]:
    """Send one or more page images to the OpenAI vision model and ask it
    to turn them into Anki cards.

    Each image is resized to fit within 1024x1536, cropped to its left
    512-px column, JPEG-encoded and embedded as a base64 data URL. When
    ``do_ocr`` is True, local Tesseract OCR is also run on the untouched
    original image and its text is sent alongside each picture as a hint
    for the model.

    :param image_paths: a single path or a list of paths to page images
        (a bare string is treated as a one-element batch)
    :param do_ocr: also run local OCR and include its output in the prompt
    :param lang: Tesseract language spec (e.g. ``'eng+chi_sim'``); None
        falls through to pytesseract's default handling
    :return: ``(completion, response_json)`` — the model's text, or None
        when the response contains no ``choices`` (e.g. an API error),
        plus the full decoded JSON response for usage/cost inspection
    """
    images = []
    ocr_results = []

    # Allow a single path to be passed without wrapping it in a list.
    if isinstance(image_paths, str):
        image_paths = [image_paths]
    for image_path in image_paths:
        resized_image = resize_image(image_path, 1536, 1024)  # seems to work good for these dict pages
        cropped_image = crop_image_to_left_side(resized_image, 512)
        # cropped_image.show()
        # print(cropped_image.size)
        # exit(1)
        base64_image = encode_image(cropped_image)
        images.append(base64_image)
        if do_ocr:
            # OCR runs on the full original image, not the resized crop.
            original_image = Image.open(image_path)
            print("doing local ocr...", end='')
            ocr_text = ocr(original_image, lang)
            print(f" done. local ocr resulted in {len(ocr_text)} characters.")
            # print(ocr_text) # or save it somewhere, or add it to your payload for further processing
            ocr_results.append(ocr_text)
        # print(resized_image.size)

    # exit(1)

    # generate image payload
    # For each page: an optional OCR-hint text part followed by the image
    # part; all parts are appended to the single user message below.

    image_msgs = []

    for i, base64_image in enumerate(images):
        image_payload = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}",
                "detail": "high"
            }
        }
        if do_ocr:
            # ocr_results is index-aligned with images because both are
            # appended once per path when do_ocr is on.
            ocr_payload = {
                "type": "text",
                "text": "Here are OCR results for the following page. These might be flawed. Use them to improve your "
                        "performance:\n " +
                        ocr_results[i]
            }

            image_msgs.append(ocr_payload)

        image_msgs.append(image_payload)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Chat Completions payload: system prompt from the local `prompt`
    # module, then one user message carrying the instruction plus all
    # per-page text/image parts.
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                    # {
                    #     "type": "image_url",
                    #     "image_url": {
                    #         "url": f"data:image/jpeg;base64,{base64_image}"
                    #     }
                    # }
                ] + image_msgs
            }
        ],
        "max_tokens": 600 * len(images),  # in general, around 350 tokens per page, so around double to be safe
        "temperature": 0.2,
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    response_json = response.json()
    # print(response_json)

    # Extracting the completion; error responses have no 'choices' key,
    # in which case None is returned together with the raw JSON.
    try:
        completion = response_json['choices'][0]['message']['content']
    except KeyError:
        completion = None

    # print(completion)

    return completion, response_json
|
|
|
|
|
|
def test():
    """Smoke-test image_to_anki() on a sample dictionary page and print
    the model output plus OpenAI token usage and approximate cost."""
    # Path to your image
    # image_paths = "IMG_5334.PNG"
    # image_path = 'tmp.jpg'

    sample_pages = [
        # './.img/dict.pdf_7.png',
        # './.img/dict.pdf_8.png',
        './.img/dict.pdf_103.png',
    ]

    text, meta = image_to_anki(sample_pages, do_ocr=False, lang='eng+chi_sim')

    print(text)

    usage = meta['usage']
    prompt_tokens = usage["prompt_tokens"]
    completion_tokens = usage["completion_tokens"]

    print(
        f'usage for page:\n{prompt_tokens} prompt tokens and {completion_tokens} completion tokens')
    print(
        f'approx. cost: 0.0075$ per picture, {prompt_tokens * 0.01 / 1000}$ for prompt tokens and {completion_tokens * 0.01 / 1000}$ for completion tokens')

    # NOTE(review): the printed estimate above uses 0.01/1k for completion
    # tokens while the total below uses 0.03/1k — inconsistency preserved
    # as-is; confirm which rate is intended.
    cost_this = prompt_tokens * 0.01 / 1000 + completion_tokens * 0.03 / 1000  # + 0.0075
    print(f'this page: {cost_this}$')
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the smoke test when executed as a script (performs a live API call).
    test()
|