Compare commits

..

No commits in common. "83ec148b85b9cec040c9def64343f55396908866" and "3d09cbbef8cb40ff900fe577d1ec5d0728b06fb0" have entirely different histories.

6 changed files with 47 additions and 123 deletions

3
.gitignore vendored
View File

@ -8,6 +8,5 @@ apikey.secret
*.PNG
out.md
out.*.md
*.old.*
out.old.md
/__pycache__

View File

@ -1,7 +1,6 @@
import base64
from typing import Any, Optional
from typing import Any
import pytesseract
import requests
from PIL import Image
from io import BytesIO
@ -51,18 +50,8 @@ def crop_image_to_left_side(image: Image, crop_width) -> Image:
# Resize the image and get base64 string
# resized_image = resize_image(image_path, 1024, 512)
# Function to perform OCR
def ocr(image: Image, lang: Optional[str] = 'eng') -> str:
    """Run Tesseract OCR on ``image`` via pytesseract and return the text.

    ``lang`` is forwarded unchanged as the ``lang`` argument of
    ``pytesseract.image_to_string``; it defaults to ``'eng'``.
    """
    return pytesseract.image_to_string(image, lang=lang)
def image_to_anki(image_paths: str | list[str], do_ocr: bool = False, lang: Optional[str] = None) -> tuple[
str | None, Any]:
def image_to_anki(image_paths: str | list[str]) -> tuple[str | None, Any]:
images = []
ocr_results = []
if isinstance(image_paths, str):
image_paths = [image_paths]
for image_path in image_paths:
@ -73,41 +62,11 @@ def image_to_anki(image_paths: str | list[str], do_ocr: bool = False, lang: Opti
# exit(1)
base64_image = encode_image(cropped_image)
images.append(base64_image)
if do_ocr:
original_image = Image.open(image_path)
print("doing local ocr...", end='')
ocr_text = ocr(original_image, lang)
print(f" done. local ocr resulted in {len(ocr_text)} characters.")
# print(ocr_text) # or save it somewhere, or add it to your payload for further processing
ocr_results.append(ocr_text)
# print(resized_image.size)
# exit(1)
# generate image payload
image_msgs = []
for i, base64_image in enumerate(images):
image_payload = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
if do_ocr:
ocr_payload = {
"type": "text",
"text": "Here are OCR results for the following page. These might be flawed. Use them to improve your "
"performance:\n " +
ocr_results[i]
}
image_msgs.append(ocr_payload)
image_msgs.append(image_payload)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
@ -138,11 +97,18 @@ def image_to_anki(image_paths: str | list[str], do_ocr: bool = False, lang: Opti
# "url": f"data:image/jpeg;base64,{base64_image}"
# }
# }
] + image_msgs
] + [{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
for base64_image in images]
}
],
"max_tokens": 600 * len(images), # in general, around 350 tokens per page, so around double to be safe
"temperature": 0.2,
"temperature": 0.0,
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
@ -167,12 +133,11 @@ def test():
# image_path = 'tmp.jpg'
image_path = [
# './.img/dict.pdf_7.png',
# './.img/dict.pdf_8.png',
'./.img/dict.pdf_103.png',
'./.img/dict.pdf_7.png',
'./.img/dict.pdf_8.png',
]
text, meta = image_to_anki(image_path, do_ocr=False, lang='eng+chi_sim')
text, meta = image_to_anki(image_path)
print(text)
@ -183,9 +148,10 @@ def test():
print(
f'approx. cost: 0.0075$ per picture, {usage["prompt_tokens"] * 0.01 / 1000}$ for prompt tokens and {usage["completion_tokens"] * 0.01 / 1000}$ for completion tokens')
cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.03 / 1000 # + 0.0075
cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.01 / 1000 + 0.0075
print(f'this page: {cost_this}$')
if __name__ == '__main__':
test()

11
main.py
View File

@ -18,8 +18,6 @@ def main():
parser.add_argument('--pages', type=str, required=True, help='Specify pages to parse in format <num>-<num>')
parser.add_argument('--output-file', type=str, default='out.md', help='Specify output file')
parser.add_argument('--images-path', type=str, default='./.img/', help='Specify output file')
parser.add_argument('--ocr', type=str, default=None, help='If present, send ocr=true to the image_to_anki method, and give the string value to the lang parameter')
parser.add_argument('--batch-size', type=int, default=3, help='Decide how many pages are processed in parallel')
parser.add_argument('pdf_file', type=str, help='Specify PDF file name')
args = parser.parse_args()
@ -64,12 +62,11 @@ def main():
break_outer = False
for i in range(len(paths) // args.batch_size + 1): # the batch size argument is used here
for i in range(len(paths) // IMGS_PER_REQUEST + 1):
# print(i)
# collect images
while True:
to_process = paths[i * args.batch_size:i * args.batch_size + args.batch_size] # the batch size argument is used here
to_process = paths[i * IMGS_PER_REQUEST:i * IMGS_PER_REQUEST + IMGS_PER_REQUEST]
# print(to_process)
if len(to_process) == 0:
# skip if remaining list is empty (e.g. if 4 pages at package size 2)
@ -77,9 +74,7 @@ def main():
print(f'processing {len(to_process)} image{"s" if len(to_process) != 1 else ""}')
ocr = True if args.ocr else False # set OCR to True if --ocr parameter is present
cards, meta = dict_to_anki.image_to_anki(to_process, do_ocr=ocr, lang=args.ocr)
cards, meta = dict_to_anki.image_to_anki(to_process)
if not cards:
print("Error processing! Response: " + meta)

28
poetry.lock generated
View File

@ -269,17 +269,6 @@ typing-extensions = ">=4.7,<5"
[package.extras]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
[[package]]
name = "packaging"
version = "23.2"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.7"
files = [
{file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"},
{file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"},
]
[[package]]
name = "pdf2image"
version = "1.17.0"
@ -489,21 +478,6 @@ files = [
[package.dependencies]
typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pytesseract"
version = "0.3.10"
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
]
[package.dependencies]
packaging = ">=21.3"
Pillow = ">=8.0.0"
[[package]]
name = "requests"
version = "2.31.0"
@ -587,4 +561,4 @@ zstd = ["zstandard (>=0.18.0)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "07e8002d23153d51441fa4c4a70af0d6022d127f2c6c9c900cb194741e9bbe6c"
content-hash = "1df31140161c62d430257e30b1ebbff75524b5614888dfc7809f90d5f09a5737"

View File

@ -3,10 +3,19 @@ You are AnkiBot, a program that converts dictionary pictures into Anki cards.
You will get Dictionary Images as input, and you will write Anki cards as an output.
Anki cards follow the following format:
Q: How do you use this style?
A: Just like this.
Q: Can the question
run over multiple lines?
A: Yes, and
So can the answer
Q: Does the answer need to be immediately after the question?
A: No, and preceding whitespace will be ignored.
Q: How is this possible?
A: The 'magic' of regular expressions!
@ -30,29 +39,11 @@ A: 公务员
Q: der Job -s
A: 工作
Q: eine(r, s) (PRON)
A: 一人
Q: euch (PRON)
A: 你们(三格, 四格)
Q: irgendwelche(r, s) (PRON)
A: 任何一个, 某物, 不知哪些, 某些, 任何一些
You are programmed to ALWAYS follow these instructions:
- ONLY write down the words in the dictionary. Output NOTHING ELSE than the words in the dictionary.
- If a page does not contain any words (e.g. grammar info, title, ...), SKIP THAT PAGE and do not write down ANYTHING for it.
- Always use EXACTLY the characters in the dictionary. ONLY translate free-hand IF AND ONLY IF Chinese is unrecognizable AND OCR (if available) didn't work.
- Case descriptions from the dictionary (e.g. (三格)) shall be written down AS-IS, and NOT be changed into something like (宾格)
- Make sure to write Anki cards for EVERY word. DO NOT leave any out.
- DO NOT modify the dictionary content, just transform the entries as-is into Anki cards.
- If words are separated by a | (e.g. zu|lassen), MAKE SURE to also write down that |.
- Add (ADJ) to adjectives, and (ADV) to adverbs, as well as other bracket content if written in the dictionary.
- You might get OCR text for each image. If so, assume that the OCR text is the raw, unprocessed output of the OCR tool. It might be imperfect, or formatted wrongly. Use the OCR content to improve your performance TOGETHER with the provided image, while keeping its constraints in mind. ALWAYS prefer OCR text recognition if available.
Remember, do NOT reword dictionary content!
ONLY write down the words in the dictionary. Output NOTHING ELSE than the words in the dictionary.
If a page does not contain any words (e.g. grammar info, title, ...), SKIP THAT PAGE and do not write down ANYTHING for it.
You have perfect OCR for roman letters, and chinese characters, and never make a mistake.
Make sure to write Anki cards for EVERY word. DO NOT leave any out. Try to always use the chinese words used in the dictionary, don't reword.
DO NOT modify the dictionary content, just transform the entries as-is into Anki cards.
'''
# DO NOT reword or rewrite chinese translation, just copy them from the dictionary!

View File

@ -12,7 +12,6 @@ openai = "^1.10.0"
requests = "^2.31.0"
pillow = "^10.2.0"
pdf2image = "^1.17.0"
pytesseract = "^0.3.10"
[build-system]