Yandrik
d9eb6f1c64
Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments
192 lines
5.5 KiB
Python
192 lines
5.5 KiB
Python
import base64
|
|
from typing import Any, Optional
|
|
|
|
import pytesseract
|
|
import requests
|
|
from PIL import Image
|
|
from io import BytesIO
|
|
|
|
# OpenAI API Key
|
|
|
|
import prompt
|
|
|
|
# Load the OpenAI API key from a local secrets file; it is used as the
# Bearer token for the Chat Completions request in image_to_anki().
try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    # Fix: removed a stray "w" from the message and define api_key so a
    # later reference fails with an obvious value instead of a NameError.
    print('Couldn\'t read API key from file \'apikey.secret\'. Does it exist?')
    api_key = None
|
|
|
|
|
|
# Function to encode image
|
|
def encode_image(image):
    """Serialize a PIL image to a base64-encoded JPEG string.

    The image is written as JPEG (quality 95) into an in-memory buffer and
    the resulting bytes are base64-encoded to UTF-8 text, ready to embed in
    a ``data:image/jpeg;base64,...`` URL.
    """
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG", quality=95)
    return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
|
|
|
|
|
|
# Function to resize the image
|
|
def resize_image(input_image_path, max_height, max_width):
    """Open the image at *input_image_path* and shrink it to fit inside
    ``max_width`` x ``max_height`` while preserving aspect ratio.

    Images that already fit are returned unscaled; the image is never
    enlarged.
    """
    img = Image.open(input_image_path)
    width, height = img.size

    # Scale factor needed to fit each dimension inside its bound; the
    # smaller of the two keeps both dimensions within limits.
    scale = min(max_width / width, max_height / height)

    # Only downscale — a scale >= 1 means the image already fits.
    if scale < 1:
        return img.resize((int(scale * width), int(scale * height)))
    return img
|
|
|
|
|
|
def crop_image_to_left_side(image: Image, crop_width) -> Image:
    """Return the left-hand strip of *image*, at most *crop_width* px wide.

    The full height is kept; if the image is narrower than *crop_width*,
    the whole image is cropped (i.e. returned unchanged in content).
    """
    strip_width = min(image.size[0], crop_width)
    return image.crop((0, 0, strip_width, image.size[1]))
|
|
|
|
|
|
# Resize the image and get base64 string
|
|
# resized_image = resize_image(image_path, 1024, 512)
|
|
|
|
|
|
# Function to perform OCR
|
|
def ocr(image: Image, lang: Optional[str] = 'eng') -> str:
    """Run Tesseract OCR over *image* and return the recognized text.

    *lang* is a Tesseract language spec such as ``'eng'`` or
    ``'eng+chi_sim'``.
    """
    return pytesseract.image_to_string(image, lang=lang)
|
|
|
|
|
|
def image_to_anki(image_paths: str | list[str], do_ocr: bool = False, lang: Optional[str] = None) -> tuple[
    str | None, Any]:
    """Send one or more page images to the OpenAI vision model and ask it
    to turn them into Anki cards.

    Each image is resized to fit within 1024x1536, cropped to its left
    512-px column, JPEG-encoded and embedded as a base64 data URL. When
    ``do_ocr`` is True, local Tesseract OCR is also run on the untouched
    original image and its text is sent alongside each picture as a hint
    for the model.

    :param image_paths: a single path or a list of paths to page images
        (a bare string is treated as a one-element batch)
    :param do_ocr: also run local OCR and include its output in the prompt
    :param lang: Tesseract language spec (e.g. ``'eng+chi_sim'``); None
        falls through to pytesseract's default handling
    :return: ``(completion, response_json)`` — the model's text, or None
        when the response contains no ``choices`` (e.g. an API error),
        plus the full decoded JSON response for usage/cost inspection
    """
    images = []
    ocr_results = []

    # Allow a single path to be passed without wrapping it in a list.
    if isinstance(image_paths, str):
        image_paths = [image_paths]
    for image_path in image_paths:
        resized_image = resize_image(image_path, 1536, 1024)  # seems to work good for these dict pages
        cropped_image = crop_image_to_left_side(resized_image, 512)
        # cropped_image.show()
        # print(cropped_image.size)
        # exit(1)
        base64_image = encode_image(cropped_image)
        images.append(base64_image)
        if do_ocr:
            # OCR runs on the full original image, not the resized crop.
            original_image = Image.open(image_path)
            print("doing local ocr...", end='')
            ocr_text = ocr(original_image, lang)
            print(f" done. local ocr resulted in {len(ocr_text)} characters.")
            # print(ocr_text) # or save it somewhere, or add it to your payload for further processing
            ocr_results.append(ocr_text)
        # print(resized_image.size)

    # exit(1)

    # generate image payload
    # For each page: an optional OCR-hint text part followed by the image
    # part; all parts are appended to the single user message below.

    image_msgs = []

    for i, base64_image in enumerate(images):
        image_payload = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}",
                "detail": "high"
            }
        }
        if do_ocr:
            # ocr_results is index-aligned with images because both are
            # appended once per path when do_ocr is on.
            ocr_payload = {
                "type": "text",
                "text": "Here are OCR results for the following page. These might be flawed. Use them to improve your "
                        "performance:\n " +
                        ocr_results[i]
            }

            image_msgs.append(ocr_payload)

        image_msgs.append(image_payload)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Chat Completions payload: system prompt from the local `prompt`
    # module, then one user message carrying the instruction plus all
    # per-page text/image parts.
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": prompt.SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transform this image into Anki cards."
                    },
                    # {
                    #     "type": "image_url",
                    #     "image_url": {
                    #         "url": f"data:image/jpeg;base64,{base64_image}"
                    #     }
                    # }
                ] + image_msgs
            }
        ],
        "max_tokens": 600 * len(images),  # in general, around 350 tokens per page, so around double to be safe
        "temperature": 0.2,
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    response_json = response.json()
    # print(response_json)

    # Extracting the completion; error responses have no 'choices' key,
    # in which case None is returned together with the raw JSON.
    try:
        completion = response_json['choices'][0]['message']['content']
    except KeyError:
        completion = None

    # print(completion)

    return completion, response_json
|
|
|
|
|
|
def test():
    """Smoke-test image_to_anki() on a sample dictionary page and print
    the model output plus OpenAI token usage and approximate cost."""
    # Path to your image
    # image_paths = "IMG_5334.PNG"
    # image_path = 'tmp.jpg'

    sample_pages = [
        # './.img/dict.pdf_7.png',
        # './.img/dict.pdf_8.png',
        './.img/dict.pdf_103.png',
    ]

    text, meta = image_to_anki(sample_pages, do_ocr=False, lang='eng+chi_sim')

    print(text)

    usage = meta['usage']
    prompt_tokens = usage["prompt_tokens"]
    completion_tokens = usage["completion_tokens"]

    print(
        f'usage for page:\n{prompt_tokens} prompt tokens and {completion_tokens} completion tokens')
    print(
        f'approx. cost: 0.0075$ per picture, {prompt_tokens * 0.01 / 1000}$ for prompt tokens and {completion_tokens * 0.01 / 1000}$ for completion tokens')

    # NOTE(review): the printed estimate above uses 0.01/1k for completion
    # tokens while the total below uses 0.03/1k — inconsistency preserved
    # as-is; confirm which rate is intended.
    cost_this = prompt_tokens * 0.01 / 1000 + completion_tokens * 0.03 / 1000  # + 0.0075
    print(f'this page: {cost_this}$')
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the smoke test when executed as a script (performs a live API call).
    test()
|