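"""Turn scanned dictionary pages into Anki cards by sending the page images
to the OpenAI chat completions API (gpt-4-vision-preview)."""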
import base64
from io import BytesIO
from typing import Any

import requests
from PIL import Image

import prompt

# OpenAI API key (read from a local file so it is not committed with the code)
try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    raise SystemExit("Couldn't read API key from file 'apikey.secret'. Does it exist?")


# Function to encode an image as a base64 JPEG string
def encode_image(image):
    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return img_str


# Function to resize the image so it fits within max_width x max_height
def resize_image(input_image_path, max_height, max_width):
    original_image = Image.open(input_image_path)
    width, height = original_image.size

    # scale factors needed to fit within the maximum dimensions
    wr = max_width / width
    hr = max_height / height

    # only downscale; images that already fit are left untouched
    if wr < 1 or hr < 1:
        ratio = min(wr, hr)
        image = original_image.resize((int(ratio * width), int(ratio * height)))
    else:
        image = original_image

    return image


def crop_image_to_left_side(image: Image.Image, crop_width: int) -> Image.Image:
    # keep only the leftmost crop_width pixels, full height
    return image.crop((0, 0, min(image.size[0], crop_width), image.size[1]))


def image_to_anki(image_paths: str | list[str]) -> tuple[str | None, Any]:
    """Send one or more page images to the vision API and return
    (completion text or None, full JSON response)."""
    images = []
    if isinstance(image_paths, str):
        image_paths = [image_paths]

    for image_path in image_paths:
        resized_image = resize_image(image_path, 1536, 1024)  # seems to work well for these dict pages
        cropped_image = crop_image_to_left_side(resized_image, 512)
        # cropped_image.show()  # uncomment to inspect the crop
        base64_image = encode_image(cropped_image)
        images.append(base64_image)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                ] + [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                    for base64_image in images
                ]
            }
        ],
        "max_tokens": 600 * len(images),  # in general, around 350 tokens per page, so around double to be safe
        "temperature": 0.0,
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    response_json = response.json()
    # print(response_json)

    # Extract the completion; on API errors the response has no 'choices' key
    try:
        completion = response_json['choices'][0]['message']['content']
    except KeyError:
        completion = None

    return completion, response_json


def test():
    # Paths to your images
    # image_path = "IMG_5334.PNG"
    # image_path = 'tmp.jpg'

    image_path = [
        './.img/dict.pdf_7.png',
        './.img/dict.pdf_8.png',
    ]

    text, meta = image_to_anki(image_path)

    print(text)

    usage = meta['usage']

    print(f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
    print(f'approx. cost: 0.0075$ per picture, {usage["prompt_tokens"] * 0.01 / 1000}$ for prompt tokens '
          f'and {usage["completion_tokens"] * 0.01 / 1000}$ for completion tokens')

    cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.01 / 1000 + 0.0075
    print(f'this page: {cost_this}$')


if __name__ == '__main__':
    test()
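# Example of calling image_to_anki directly with a single path (a sketch; writing the
# completion to 'cards.txt' for Anki's import dialog is an assumption, not part of this script):
#
#   text, meta = image_to_anki('./.img/dict.pdf_7.png')
#   if text is not None:
#       with open('cards.txt', 'w') as out:
#           out.write(text)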