anki-from-dictionary/dict_to_anki.py
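
"""Turn images of dictionary pages into Anki cards with the OpenAI vision API."""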

import base64
from typing import Any
import requests
from PIL import Image
from io import BytesIO
import prompt

# OpenAI API key
try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    print('Couldn\'t read API key from file \'apikey.secret\'. Does it exist?')

# Function to encode image
def encode_image(image):
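    """Return the image encoded as a base64 JPEG string."""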
    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return img_str

# Function to resize the image
def resize_image(input_image_path, max_height, max_width):
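    """Shrink the image to fit within max_width x max_height while keeping its aspect ratio; never upscale."""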
    original_image = Image.open(input_image_path)
    width, height = original_image.size
    # resize ratios required to fit within the bounds
    wr = max_width / width
    hr = max_height / height
    if wr < 1 or hr < 1:
        ratio = min(wr, hr)
        image = original_image.resize((int(ratio * width), int(ratio * height)))
    else:
        image = original_image
    return image

def crop_image_to_left_side(image: Image.Image, crop_width: int) -> Image.Image:
    return image.crop((0, 0, min(image.size[0], crop_width), image.size[1]))

# Resize the image and get base64 string
# resized_image = resize_image(image_path, 1024, 512)
def image_to_anki(image_paths: str | list[str]) -> tuple[str | None, Any]:
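    """Send the page image(s) to the vision model and return (completion text or None, raw response JSON)."""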
    images = []
    if isinstance(image_paths, str):
        image_paths = [image_paths]
    for image_path in image_paths:
        resized_image = resize_image(image_path, 1536, 1024)  # seems to work well for these dictionary pages
        cropped_image = crop_image_to_left_side(resized_image, 512)
        # cropped_image.show()
        # print(cropped_image.size)
        # exit(1)
        base64_image = encode_image(cropped_image)
        images.append(base64_image)
        # print(resized_image.size)
        # exit(1)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                    # {
                    #     "type": "image_url",
                    #     "image_url": {
                    #         "url": f"data:image/jpeg;base64,{base64_image}"
                    #     }
                    # }
                ] + [{
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                } for base64_image in images]
            }
        ],
        "max_tokens": 600 * len(images),  # in general, around 350 tokens per page, so around double to be safe
        "temperature": 0.0,
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response_json = response.json()
    # print(response_json)

    # Extracting the completion
    try:
        completion = response_json['choices'][0]['message']['content']
    except KeyError:
        completion = None
    # print(completion)
    return completion, response_json

def test():
    # Path to your image
    # image_paths = "IMG_5334.PNG"
    # image_path = 'tmp.jpg'
    image_path = [
        './.img/dict.pdf_7.png',
        './.img/dict.pdf_8.png',
    ]
    text, meta = image_to_anki(image_path)
    print(text)
    usage = meta['usage']
    print(
        f'usage for page:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
    print(
        f'approx. cost: 0.0075$ per picture, {usage["prompt_tokens"] * 0.01 / 1000}$ for prompt tokens and {usage["completion_tokens"] * 0.01 / 1000}$ for completion tokens')
    cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.01 / 1000 + 0.0075
    print(f'this page: {cost_this}$')

if __name__ == '__main__':
    test()