anki-from-dictionary/dict_to_anki.py

import base64
from typing import Any, Optional

import pytesseract
import requests
from PIL import Image
from io import BytesIO

# OpenAI API Key

import prompt

# Load the OpenAI API key from the local file 'apikey.secret'.
# ``api_key`` is always bound afterwards (``None`` when the file is missing),
# so importing this module and referencing ``api_key`` never raises NameError.
try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    api_key = None
    print("Couldn't read API key from file 'apikey.secret'. Does it exist?")


# Function to encode image
def encode_image(image):
    """Return *image* encoded as a base64 JPEG string.

    Args:
        image: A Pillow-style image object exposing ``mode``, ``convert`` and
            ``save``.

    Returns:
        The JPEG bytes (quality 95) of the image, base64-encoded and decoded
        to a UTF-8 ``str``.
    """
    # Modes Pillow's JPEG writer accepts directly. Anything else (e.g. "RGBA"
    # or "P", both common for PNG input) makes ``save`` raise OSError, so it
    # is converted to RGB first. Note that this discards any alpha channel.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


# Function to resize the image
def resize_image(input_image_path, max_height, max_width):
    """Open an image and shrink it so it fits inside the given bounds.

    The aspect ratio is kept, and images that already fit are never enlarged.

    Args:
        input_image_path: Path handed to ``Image.open``.
        max_height: Largest allowed height in pixels.
        max_width: Largest allowed width in pixels.

    Returns:
        The resized image, or the freshly opened image unchanged when it
        already fits within both limits.
    """
    source = Image.open(input_image_path)
    src_width, src_height = source.size

    # Single scale factor that satisfies the width and height limits at once.
    scale = min(max_width / src_width, max_height / src_height)
    if scale < 1:
        target_size = (int(scale * src_width), int(scale * src_height))
        return source.resize(target_size)

    return source


def crop_image_to_left_side(image: "Image.Image", crop_width: int) -> "Image.Image":
    """Keep only the leftmost ``crop_width`` pixels of *image*.

    Args:
        image: Image to crop; only ``size`` and ``crop`` are used.
        crop_width: Maximum width of the result in pixels. If the image is
            narrower than this, its full width is kept.

    Returns:
        The result of ``image.crop`` for the box spanning the full height and
        at most ``crop_width`` pixels from the left edge.
    """
    width, height = image.size
    return image.crop((0, 0, min(width, crop_width), height))


# Resize the image and get base64 string
# resized_image = resize_image(image_path, 1024, 512)


# Function to perform OCR
def ocr(image: "Image.Image", lang: Optional[str] = 'eng') -> str:
    """Run local OCR on *image* and return the recognised text.

    Args:
        image: Image handed to ``pytesseract.image_to_string``.
        lang: Language string forwarded to pytesseract unchanged
            (this file's ``test`` uses ``'eng+chi_sim'``).

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, lang=lang)


def image_to_anki(image_paths: str | list[str], do_ocr: bool = False, lang: Optional[str] = None) -> tuple[
    str | None, Any]:
    """Send dictionary page images to the OpenAI chat completions API.

    Each page is scaled down to fit within 1024x1536 pixels (width x height),
    cropped to its leftmost 512 pixels and attached to the request as a
    base64 JPEG. With ``do_ocr`` enabled, the original (unscaled, uncropped)
    page is additionally run through local OCR and the text is placed directly
    before the corresponding image in the user message.

    Args:
        image_paths: A single image path or a list of image paths.
        do_ocr: Whether to run local OCR on every page and include the result.
        lang: Language string forwarded to ``ocr``; only used when ``do_ocr``
            is true.

    Returns:
        A ``(completion, response_json)`` tuple. ``completion`` is the message
        content of the first choice, or ``None`` when the response does not
        have that shape (for example an error payload). ``response_json`` is
        the decoded JSON body of the response.
    """
    if isinstance(image_paths, str):
        image_paths = [image_paths]

    image_msgs = []
    page_count = 0

    for image_path in image_paths:
        resized_image = resize_image(image_path, 1536, 1024)  # seems to work good for these dict pages
        cropped_image = crop_image_to_left_side(resized_image, 512)
        base64_image = encode_image(cropped_image)
        page_count += 1

        if do_ocr:
            print("doing local ocr...", end='')
            # The context manager closes the file handle once OCR is done
            # instead of leaving one open image per page behind.
            with Image.open(image_path) as original_image:
                ocr_text = ocr(original_image, lang)
            print(f" done. local ocr resulted in {len(ocr_text)} characters.")

            # The OCR text goes right before the image it belongs to.
            image_msgs.append({
                "type": "text",
                "text": "Here are OCR results for the following page. These might be flawed. Use them to improve your "
                        "performance:\n " +
                        ocr_text
            })

        image_msgs.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}",
                "detail": "high"
            }
        })

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                ] + image_msgs
            }
        ],
        "max_tokens": 600 * page_count,  # in general, around 350 tokens per page, so around double to be safe
        "temperature": 0.2,
    }

    # NOTE(review): no timeout is set, so a stalled connection blocks forever.
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    response_json = response.json()

    # Error payloads carry no 'choices'; an empty 'choices' list or an
    # unexpected value type is treated the same way rather than crashing.
    try:
        completion = response_json['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        completion = None

    return completion, response_json


def test():
    """Run one sample page through ``image_to_anki`` and print text and cost.

    Prints the returned completion, the token usage reported by the API and
    an approximate cost derived from it. If the response contains no
    ``usage`` entry (for example because the request failed), the raw
    response is printed instead of the cost breakdown.
    """
    # Sample page; add more paths to send several pages in one request.
    image_path = [
        './.img/dict.pdf_103.png',
    ]

    text, meta = image_to_anki(image_path, do_ocr=False, lang='eng+chi_sim')

    print(text)

    usage = meta.get('usage') if isinstance(meta, dict) else None
    if usage is None:
        print(f'no usage data in response: {meta}')
        return

    # Per-1K-token rates: 0.01$ for prompt tokens, 0.03$ for completion tokens.
    # The breakdown below uses the same rates as the total.
    prompt_cost = usage["prompt_tokens"] * 0.01 / 1000
    completion_cost = usage["completion_tokens"] * 0.03 / 1000

    print(
        f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
    print(
        f'approx. cost: 0.0075$ per picture, {prompt_cost}$ for prompt tokens and {completion_cost}$ for completion tokens')

    cost_this = prompt_cost + completion_cost  # + 0.0075
    print(f'this page: {cost_this}$')


# When executed as a script (not imported), run the manual sample in test().
if __name__ == '__main__':
    test()
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`import base64`
Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`from typing import Any, Optional`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00
Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`import pytesseract`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`import requests`
			`from PIL import Image`
			`from io import BytesIO`

			`# OpenAI API Key`

			`import prompt`

			`try:`
			`with open('apikey.secret') as f:`
			`api_key = f.read().strip()`
			`except FileNotFoundError:`
			`print('Couldn\'t read API key from file \'apikey.secret\'w. Does it exist?')`


			`# Function to encode image`
			`def encode_image(image):`
			`buffered = BytesIO()`
			`image.save(buffered, format="JPEG", quality=95)`
			`img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')`
			`return img_str`


			`# Function to resize the image`
			`def resize_image(input_image_path, max_height, max_width):`
			`original_image = Image.open(input_image_path)`
			`width, height = original_image.size`

			`# resize amount required`

			`wr = max_width / width`
			`hr = max_height / height`

			`if wr < 1 or hr < 1:`
			`ratio = min(wr, hr)`
			`image = original_image.resize((int(ratio * width), int(ratio * height)))`
			`else:`
			`image = original_image`

			`return image`


			`def crop_image_to_left_side(image: Image, crop_width) -> Image:`
			`return image.crop((0, 0, min(image.size[0], crop_width), image.size[1]))`


			`# Resize the image and get base64 string`
			`# resized_image = resize_image(image_path, 1024, 512)`

Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00
			`# Function to perform OCR`
			`def ocr(image: Image, lang: Optional[str] = 'eng') -> str:`
			`text = pytesseract.image_to_string(image, lang=lang)`
			`return text`


			`def image_to_anki(image_paths: str \| list[str], do_ocr: bool = False, lang: Optional[str] = None) -> tuple[`
			`str \| None, Any]:`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`images = []`
Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`ocr_results = []`

INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`if isinstance(image_paths, str):`
			`image_paths = [image_paths]`
			`for image_path in image_paths:`
			`resized_image = resize_image(image_path, 1536, 1024) # seems to work good for these dict pages`
			`cropped_image = crop_image_to_left_side(resized_image, 512)`
			`# cropped_image.show()`
			`# print(cropped_image.size)`
			`# exit(1)`
			`base64_image = encode_image(cropped_image)`
			`images.append(base64_image)`
Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`if do_ocr:`
			`original_image = Image.open(image_path)`
			`print("doing local ocr...", end='')`
			`ocr_text = ocr(original_image, lang)`
			`print(f" done. local ocr resulted in {len(ocr_text)} characters.")`
			`# print(ocr_text) # or save it somewhere, or add it to your payload for further processing`
			`ocr_results.append(ocr_text)`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`# print(resized_image.size)`

			`# exit(1)`

Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`# generate image payload`

			`image_msgs = []`

			`for i, base64_image in enumerate(images):`
			`image_payload = {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": f"data:image/jpeg;base64,{base64_image}",`
			`"detail": "high"`
			`}`
			`}`
			`if do_ocr:`
			`ocr_payload = {`
			`"type": "text",`
			`"text": "Here are OCR results for the following page. These might be flawed. Use them to improve your "`
			`"performance:\n " +`
			`ocr_results[i]`
			`}`

			`image_msgs.append(ocr_payload)`

			`image_msgs.append(image_payload)`

INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`headers = {`
			`"Content-Type": "application/json",`
			`"Authorization": f"Bearer {api_key}"`
			`}`

			`payload = {`
			`"model": "gpt-4-vision-preview",`
			`"messages": [`
			`{`
			`"role": "system",`
			`"content": [`
			`{`
			`"type": "text",`
			`"text": prompt.SYSTEM_PROMPT,`
			`}`
			`]`
			`},`
			`{`
			`"role": "user",`
			`"content": [`
Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`{`
			`"type": "text",`
			`"text": "Transform this image into Anki cards."`
			`},`
			`# {`
			`# "type": "image_url",`
			`# "image_url": {`
			`# "url": f"data:image/jpeg;base64,{base64_image}"`
			`# }`
			`# }`
			`] + image_msgs`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`}`
			`],`
			`"max_tokens": 600 * len(images), # in general, around 350 tokens per page, so around double to be safe`
Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`"temperature": 0.2,`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`}`

			`response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)`

			`response_json = response.json()`
			`# print(response_json)`

			`# Extracting the completion`
			`try:`
			`completion = response_json['choices'][0]['message']['content']`
			`except KeyError:`
			`completion = None`

			`# print(completion)`

			`return completion, response_json`


			`def test():`
			`# Path to your image`
			`# image_paths = "IMG_5334.PNG"`
			`# image_path = 'tmp.jpg'`

			`image_path = [`
Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`# './.img/dict.pdf_7.png',`
			`# './.img/dict.pdf_8.png',`
			`'./.img/dict.pdf_103.png',`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`]`

Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`text, meta = image_to_anki(image_path, do_ocr=False, lang='eng+chi_sim')`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00
			`print(text)`

			`usage = meta['usage']`

			`print(`
			`f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')`
			`print(`
			`f'approx. cost: 0.0075$ per picture, {usage["prompt_tokens"] * 0.01 / 1000}$ for prompt tokens and {usage["completion_tokens"] * 0.01 / 1000}$ for completion tokens')`

Implement local OCR and batch processing CLI flag Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments 2024-02-05 08:47:49 +00:00			`cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.03 / 1000 # + 0.0075`
INITIAL COMMIT 2024-02-01 11:56:34 +00:00			`print(f'this page: {cost_this}$')`


			`if __name__ == '__main__':`
			`test()`