Update prompt.py to add instructions and improve clarity and performance

The updated prompt contains more detailed and clearer instructions, including specifications for how dictionary content should be handled, and more examples. Unused question-and-answer examples have been removed. Additional rules regarding dictionary content and OCR text recognition have been included.
Update .gitignore file to include new patterns
2024-02-05 09:49:08 +01:00 · 2024-02-05 09:48:19 +01:00 · 2024-02-05 09:47:49 +01:00
6 changed files with 123 additions and 47 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,5 +8,6 @@ apikey.secret
 *.PNG

 out.md
-out.old.md
+out.*.md
+*.old.*
 /__pycache__
--- a/dict_to_anki.py
+++ b/dict_to_anki.py
@ -1,6 +1,7 @@
 import base64
-from typing import Any
+from typing import Any, Optional

+import pytesseract
 import requests
 from PIL import Image
 from io import BytesIO
@ -50,8 +51,18 @@ def crop_image_to_left_side(image: Image, crop_width) -> Image:
 # Resize the image and get base64 string
 # resized_image = resize_image(image_path, 1024, 512)

-def image_to_anki(image_paths: str | list[str]) -> tuple[str | None, Any]:
+
+# Run Tesseract OCR (via pytesseract) on an image and return the recognized text
+def ocr(image: Image, lang: Optional[str] = 'eng') -> str:
+    text = pytesseract.image_to_string(image, lang=lang)
+    return text
+
+
+def image_to_anki(image_paths: str | list[str], do_ocr: bool = False, lang: Optional[str] = None) -> tuple[
+    str | None, Any]:
    images = []
+    ocr_results = []
+
    if isinstance(image_paths, str):
        image_paths = [image_paths]
    for image_path in image_paths:
@ -62,11 +73,41 @@ def image_to_anki(image_paths: str | list[str]) -> tuple[str | None, Any]:
        # exit(1)
        base64_image = encode_image(cropped_image)
        images.append(base64_image)
+        if do_ocr:
+            original_image = Image.open(image_path)
+            print("doing local ocr...", end='')
+            ocr_text = ocr(original_image, lang)
+            print(f" done. local ocr resulted in {len(ocr_text)} characters.")
+            # print(ocr_text)  # or save it somewhere, or add it to your payload for further processing
+            ocr_results.append(ocr_text)
    # print(resized_image.size)

-
    # exit(1)

+    # generate image payload
+
+    image_msgs = []
+
+    for i, base64_image in enumerate(images):
+        image_payload = {
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:image/jpeg;base64,{base64_image}",
+                "detail": "high"
+            }
+        }
+        if do_ocr:
+            ocr_payload = {
+                "type": "text",
+                "text": "Here are OCR results for the following page. These might be flawed. Use them to improve your "
+                        "performance:\n " +
+                        ocr_results[i]
+            }
+
+            image_msgs.append(ocr_payload)
+
+        image_msgs.append(image_payload)
+
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
@ -97,18 +138,11 @@ def image_to_anki(image_paths: str | list[str]) -> tuple[str | None, Any]:
                               #         "url": f"data:image/jpeg;base64,{base64_image}"
                               #     }
                               # }
-                ] + [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_image}",
-                        "detail": "high"
-                    }
-                }
-                    for base64_image in images]
+                           ] + image_msgs
            }
        ],
        "max_tokens": 600 * len(images),  # in general, around 350 tokens per page, so around double to be safe
-        "temperature": 0.0,
+        "temperature": 0.2,
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
@ -133,11 +167,12 @@ def test():
    # image_path = 'tmp.jpg'

    image_path = [
-        './.img/dict.pdf_7.png',
-        './.img/dict.pdf_8.png',
+        # './.img/dict.pdf_7.png',
+        # './.img/dict.pdf_8.png',
+        './.img/dict.pdf_103.png',
    ]

-    text, meta = image_to_anki(image_path)
+    text, meta = image_to_anki(image_path, do_ocr=False, lang='eng+chi_sim')

    print(text)

@ -148,10 +183,9 @@ def test():
    print(
        f'approx. cost: 0.0075$ per picture, {usage["prompt_tokens"] * 0.01 / 1000}$ for prompt tokens and {usage["completion_tokens"] * 0.01 / 1000}$ for completion tokens')

-    cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.01 / 1000 + 0.0075
+    cost_this = usage["prompt_tokens"] * 0.01 / 1000 + usage["completion_tokens"] * 0.03 / 1000 # + 0.0075
    print(f'this page: {cost_this}$')


-
 if __name__ == '__main__':
    test()
--- a/main.py
+++ b/main.py
@ -18,6 +18,8 @@ def main():
    parser.add_argument('--pages', type=str, required=True, help='Specify pages to parse in format <num>-<num>')
    parser.add_argument('--output-file', type=str, default='out.md', help='Specify output file')
    parser.add_argument('--images-path', type=str, default='./.img/', help='Specify output file')
+    parser.add_argument('--ocr', type=str, default=None, help='If present, send ocr=true to the image_to_anki method, and give the string value to the lang parameter')
+    parser.add_argument('--batch-size', type=int, default=3, help='Decide how many pages are processed in parallel')
    parser.add_argument('pdf_file', type=str, help='Specify PDF file name')

    args = parser.parse_args()
@ -62,11 +64,12 @@ def main():

    break_outer = False

-    for i in range(len(paths) // IMGS_PER_REQUEST + 1):
+    for i in range(len(paths) // args.batch_size + 1):  # the batch size argument is used here
        # print(i)
+
        # collect images
        while True:
-            to_process = paths[i * IMGS_PER_REQUEST:i * IMGS_PER_REQUEST + IMGS_PER_REQUEST]
+            to_process = paths[i * args.batch_size:i * args.batch_size + args.batch_size]  # the batch size argument is used here
            # print(to_process)
            if len(to_process) == 0:
                # skip if remaining list is empty (e.g. if 4 pages at package size 2)
@ -74,7 +77,9 @@ def main():

            print(f'processing {len(to_process)} image{"s" if len(to_process) != 1 else ""}')

-            cards, meta = dict_to_anki.image_to_anki(to_process)
+            ocr = True if args.ocr else False  # set OCR to True if --ocr parameter is present
+
+            cards, meta = dict_to_anki.image_to_anki(to_process, do_ocr=ocr, lang=args.ocr)

            if not cards:
                print("Error processing! Response: " + meta)
--- a/poetry.lock
+++ b/poetry.lock
@ -269,6 +269,17 @@ typing-extensions = ">=4.7,<5"
 [package.extras]
 datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]

+[[package]]
+name = "packaging"
+version = "23.2"
+description = "Core utilities for Python packages"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"},
+    {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"},
+]
+
 [[package]]
 name = "pdf2image"
 version = "1.17.0"
@ -478,6 +489,21 @@ files = [
 [package.dependencies]
 typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"

+[[package]]
+name = "pytesseract"
+version = "0.3.10"
+description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
+    {file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
+]
+
+[package.dependencies]
+packaging = ">=21.3"
+Pillow = ">=8.0.0"
+
 [[package]]
 name = "requests"
 version = "2.31.0"
@ -561,4 +587,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "1df31140161c62d430257e30b1ebbff75524b5614888dfc7809f90d5f09a5737"
+content-hash = "07e8002d23153d51441fa4c4a70af0d6022d127f2c6c9c900cb194741e9bbe6c"
--- a/prompt.py
+++ b/prompt.py
@ -3,19 +3,10 @@ You are AnkiBot, a program that converts dictionary pictures into Anki cards.
 You will get Dictionary Images as input, and you will write Anki cards as an output.
 Anki cards follow the following format:

+
 Q: How do you use this style?
 A: Just like this.

-Q: Can the question
-run over multiple lines?
-A: Yes, and
-So can the answer
-
-Q: Does the answer need to be immediately after the question?
-
-
-A: No, and preceding whitespace will be ignored.
-
 Q: How is this possible?
 A: The 'magic' of regular expressions!

@ -39,11 +30,29 @@ A: 公务员
 Q: der Job -s
 A: 工作

-ONLY write down the words in the dictionary. Output NOTHING ELSE than the words in the dictionary.
-If a page does not contain any words (e.g. grammar info, title, ...), SKIP THAT PAGE and do not write down ANYTHING for it.
-You have perfect OCR for roman letters, and chinese characters, and never make a mistake.
-Make sure to write Anki cards for EVERY word. DO NOT leave any out. Try to always use the chinese words used in the dictionary, don't reword.
-DO NOT modify the dictionary content, just transform the entries as-is into Anki cards.
+Q: eine(r, s) (PRON)
+A: 一人
+
+Q: euch (PRON)
+A: 你们(三格, 四格) 
+
+Q: irgendwelche(r, s) (PRON)
+A: 任何一个, 某物, 不知哪些, 某些, 任何一些
+
+You are programmed to ALWAYS follow these instructions:
+- ONLY write down the words in the dictionary. Output NOTHING ELSE than the words in the dictionary.
+- If a page does not contain any words (e.g. grammar info, title, ...), SKIP THAT PAGE and do not write down ANYTHING for it.
+- Always use EXACTLY the characters in the dictionary. ONLY translate free-hand IF AND ONLY IF chinese is unrecognizable AND OCR (if available) didn't work.
+- Case descriptions from the dictionary (e.g. 你(三格)) shall be written down AS-IS, and NOT be changed into something like 你(宾格)
+- Make sure to write Anki cards for EVERY word. DO NOT leave any out. 
+- DO NOT modify the dictionary content, just transform the entries as-is into Anki cards.
+- If words are separated by a | (e.g. zu|lassen), MAKE SURE to also write down that |.
+- Add (ADJ) to adjectives, and (ADV) to adverbs, as well as other bracket content if written in the dictionary.
+- You might get OCR text for each image. If so, assume that the OCR text is the raw, unprocessed output of the OCR tool. It might be imperfect, or formatted wrongly. Use the OCR content to improve your performance TOGETHER with the provided image, while keeping its constraints in mind. ALWAYS prefer OCR text recognition if available.
+
+Remember, do NOT reword dictionary content!
+
+
 '''

 # DO NOT reword or rewrite chinese translation, just copy them from the dictionary!
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,6 +12,7 @@ openai = "^1.10.0"
 requests = "^2.31.0"
 pillow = "^10.2.0"
 pdf2image = "^1.17.0"
+pytesseract = "^0.3.10"


 [build-system]