anki-from-dictionary/dict_to_anki.py

import base64
from typing import Any

import requests
from PIL import Image
from io import BytesIO

# OpenAI API Key

import prompt

# Load the OpenAI API key from a local file next to the script.
# `api_key` is always bound after this block: it is None when the file is
# missing, so importers can test for it instead of hitting a NameError later.
try:
    with open('apikey.secret', encoding='utf-8') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    api_key = None
    print('Couldn\'t read API key from file \'apikey.secret\'. Does it exist?')


# Function to encode image
def encode_image(image: "Image.Image") -> str:
    """Return *image* serialized as a JPEG (quality 95) and base64-encoded.

    Args:
        image: A Pillow image (anything exposing ``mode``, ``convert`` and
            ``save`` the way ``PIL.Image.Image`` does).

    Returns:
        The base64 text of the JPEG bytes, decoded as UTF-8.
    """
    # Pillow's JPEG writer rejects modes with alpha or a palette (e.g. RGBA,
    # LA, P -- common for PNG input) with an OSError. Convert those to RGB;
    # any alpha channel is dropped. Modes the writer already accepts are left
    # untouched so their output does not change.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


# Function to resize the image
def resize_image(input_image_path, max_height, max_width):
    """Open an image and shrink it, if necessary, to fit a bounding box.

    The aspect ratio is preserved. Images that already fit are returned as
    opened, without upscaling.

    Args:
        input_image_path: Path (or file object) accepted by ``Image.open``.
        max_height: Maximum allowed height in pixels.
        max_width: Maximum allowed width in pixels.

    Returns:
        The opened image, resized only when it exceeded the bounding box.
    """
    original_image = Image.open(input_image_path)
    width, height = original_image.size

    # The smaller of the two per-axis scale factors is the binding constraint.
    ratio = min(max_width / width, max_height / height)
    if ratio >= 1:
        return original_image

    # int() truncates, so a very thin image could end up with a 0-pixel side,
    # which Image.resize rejects; keep every side at least 1 pixel.
    new_size = (max(1, int(ratio * width)), max(1, int(ratio * height)))
    return original_image.resize(new_size)


def crop_image_to_left_side(image: "Image.Image", crop_width: int) -> "Image.Image":
    """Return the left-hand strip of *image*, at most *crop_width* pixels wide.

    The full height is kept. If the image is narrower than *crop_width* the
    crop box simply spans the whole image.

    Args:
        image: Image to crop; only ``size`` and ``crop`` are used.
        crop_width: Maximum width in pixels of the returned strip.

    Returns:
        The result of ``image.crop`` for the box anchored at the top-left corner.
    """
    full_width, full_height = image.size[0], image.size[1]
    right_edge = min(full_width, crop_width)
    return image.crop((0, 0, right_edge, full_height))


# Resize the image and get base64 string
# resized_image = resize_image(image_path, 1024, 512)

def image_to_anki(image_paths: str | list[str], *, timeout: float = 300.0) -> tuple[str | None, Any]:
    """Send one or more page images to the OpenAI chat completions API.

    Each image is resized to fit 1024x1536 (width x height), cropped to its
    left 512 pixels, JPEG-encoded and attached to a single user message
    together with the system prompt from ``prompt.SYSTEM_PROMPT``.

    Args:
        image_paths: A single image path or a list of image paths.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            caller forever.

    Returns:
        A ``(completion, response_json)`` tuple. ``completion`` is the text of
        the first choice, or ``None`` when the response does not contain one
        (e.g. an API error payload). ``response_json`` is the decoded
        response body.

    Raises:
        RuntimeError: If no API key is available.
    """
    if isinstance(image_paths, str):
        image_paths = [image_paths]

    # An empty or missing key can only produce an authentication error from
    # the API, so fail before preparing and uploading any images.
    if not api_key:
        raise RuntimeError('No OpenAI API key available (expected in file \'apikey.secret\').')

    # 1536 high / 1024 wide, then the left 512 px: seems to work good for
    # these dict pages.
    images = [
        encode_image(crop_image_to_left_side(resize_image(image_path, 1536, 1024), 512))
        for image_path in image_paths
    ]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    image_parts = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}",
                "detail": "high"
            }
        }
        for base64_image in images
    ]

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                ] + image_parts
            }
        ],
        "max_tokens": 600 * len(images),  # in general, around 350 tokens per page, so around double to be safe
        "temperature": 0.0,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    response_json = response.json()

    # Error payloads have no usable 'choices' entry; report that as None
    # rather than raising. IndexError covers an empty choices list, TypeError
    # a body or field that is not the expected container type.
    try:
        completion = response_json['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        completion = None

    return completion, response_json


def test():
    """Manual smoke test: convert two sample pages and print text and cost.

    Calls the real API through ``image_to_anki``, so it needs network access
    and a valid API key.
    """
    image_path = [
        './.img/dict.pdf_7.png',
        './.img/dict.pdf_8.png',
    ]

    text, meta = image_to_anki(image_path)

    print(text)

    # A failed request returns an error payload without 'usage'; show it
    # instead of crashing with a KeyError.
    usage = meta.get('usage') if isinstance(meta, dict) else None
    if usage is None:
        print(f'no usage information in response, the request probably failed:\n{meta}')
        return

    prompt_cost = usage["prompt_tokens"] * 0.01 / 1000
    completion_cost = usage["completion_tokens"] * 0.01 / 1000
    # The flat fee is per picture, so it scales with the number of pictures sent.
    picture_cost = 0.0075 * len(image_path)

    print(
        f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
    print(
        f'approx. cost: 0.0075$ per picture, {prompt_cost}$ for prompt tokens and {completion_cost}$ for completion tokens')

    cost_this = prompt_cost + completion_cost + picture_cost
    print(f'this page: {cost_this}$')


if __name__ == '__main__':
    test()
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`import base64`
			`from typing import Any`

			`import requests`
			`from PIL import Image`
			`from io import BytesIO`

			`# OpenAI API Key`

			`import prompt`

			`try:`
			`with open('apikey.secret') as f:`
			`api_key = f.read().strip()`
			`except FileNotFoundError:`
			`print('Couldn\'t read API key from file \'apikey.secret\'w. Does it exist?')`


			`# Function to encode image`
			`def encode_image(image):`
			`buffered = BytesIO()`
			`image.save(buffered, format="JPEG", quality=95)`
			`img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')`
			`return img_str`


			`# Function to resize the image`
			`def resize_image(input_image_path, max_height, max_width):`
			`original_image = Image.open(input_image_path)`
			`width, height = original_image.size`

			`# resize amount required`

			`wr = max_width / width`
			`hr = max_height / height`

			`if wr < 1 or hr < 1:`
			`ratio = min(wr, hr)`
			`image = original_image.resize((int(ratio * width), int(ratio * height)))`
			`else:`
			`image = original_image`

			`return image`


			`def crop_image_to_left_side(image: Image, crop_width) -> Image:`
			`return image.crop((0, 0, min(image.size[0], crop_width), image.size[1]))`


			`# Resize the image and get base64 string`
			`# resized_image = resize_image(image_path, 1024, 512)`

			`def image_to_anki(image_paths: str \| list[str]) -> tuple[str \| None, Any]:`
			`images = []`
			`if isinstance(image_paths, str):`
			`image_paths = [image_paths]`
			`for image_path in image_paths:`
			`resized_image = resize_image(image_path, 1536, 1024) # seems to work good for these dict pages`
			`cropped_image = crop_image_to_left_side(resized_image, 512)`
			`# cropped_image.show()`
			`# print(cropped_image.size)`
			`# exit(1)`
			`base64_image = encode_image(cropped_image)`
			`images.append(base64_image)`
			`# print(resized_image.size)`


			`# exit(1)`

			`headers = {`
			`"Content-Type": "application/json",`
			`"Authorization": f"Bearer {api_key}"`
			`}`

			`payload = {`
			`"model": "gpt-4-vision-preview",`
			`"messages": [`
			`{`
			`"role": "system",`
			`"content": [`
			`{`
			`"type": "text",`
			`"text": prompt.SYSTEM_PROMPT,`
			`}`
			`]`
			`},`
			`{`
			`"role": "user",`
			`"content": [`
			`{`
			`"type": "text",`
			`"text": "Transform this image into Anki cards."`
			`},`
			`# {`
			`# "type": "image_url",`
			`# "image_url": {`
			`# "url": f"data:image/jpeg;base64,{base64_image}"`
			`# }`
			`# }`
			`] + [{`
			`"type": "image_url",`
			`"image_url": {`
improvement: added "detail": "high" to image for chatgpt's benefit 2024-02-02 12:47:38 +00:00			`"url": f"data:image/jpeg;base64,{base64_image}",`
			`"detail": "high"`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`}`
			`}`
			`for base64_image in images]`
			`}`
			`],`
			`"max_tokens": 600 * len(images), # in general, around 350 tokens per page, so around double to be safe`
			`"temperature": 0.0,`
			`}`

			`response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)`

			`response_json = response.json()`
			`# print(response_json)`

			`# Extracting the completion`
			`try:`
			`completion = response_json['choices'][0]['message']['content']`
			`except KeyError:`
			`completion = None`

			`# print(completion)`

			`return completion, response_json`


			`def test():`
			`# Path to your image`
			`# image_paths = "IMG_5334.PNG"`
			`# image_path = 'tmp.jpg'`

			`image_path = [`
			`'./.img/dict.pdf_7.png',`
			`'./.img/dict.pdf_8.png',`
			`]`

			`text, meta = image_to_anki(image_path)`

			`print(text)`

			`usage = meta['usage']`

			`print(`
			`f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')`
			`print(`
			`f'approx. cost: 0.0075$ per picture, {usage["prompt_tokens"] * 0.01 / 1000}$ for prompt tokens and {usage["completion_tokens"] * 0.01 / 1000}$ for completion tokens')`

			`cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.01 / 1000 + 0.0075`
			`print(f'this page: {cost_this}$')`



			`if __name__ == '__main__':`
			`test()`