Yandrik
d9eb6f1c64
Implemented optical character recognition (OCR) in the image_to_anki function to vastly enhance performance. Additionally, allowed batch processing of images via explicitly specified batch size in command-line arguments
118 lines
4.1 KiB
Python
118 lines
4.1 KiB
Python
import argparse
|
|
import os
|
|
from pdf2image import convert_from_path
|
|
import dict_to_anki # ensure dict_to_anki is imported
|
|
|
|
|
|
# NOTE(review): not referenced by any code visible in this file; main() takes
# its batch size from the --batch-size argument (default 3) instead.
# Presumably a leftover default for images per request — confirm before removing.
IMGS_PER_REQUEST = 2
|
|
|
|
def is_parsable_to_int(input_string):
    """Return True if ``int(input_string)`` succeeds, False otherwise.

    Args:
        input_string: Value to probe (main() passes the raw ``--pages``
            argument).

    Returns:
        bool: True when ``int()`` accepts the value. Values that ``int()``
        rejects, including ``None`` and other unsupported types, yield
        False instead of propagating an exception.
    """
    try:
        int(input_string)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer literal (e.g. "3-5", "").
        # TypeError: an object int() cannot convert at all (e.g. None).
        return False
    return True
|
|
|
|
def _is_affirmative(answer):
    """Return True when *answer* is a yes-style reply ('y' or 'yes', any case)."""
    return answer.strip().lower() in ('y', 'yes')


def main(argv=None):
    """Convert a range of PDF pages to Anki cards and write them to a file.

    The selected pages are rendered to PNG files, sent in batches to
    ``dict_to_anki.image_to_anki``, and the returned card text is
    concatenated into the output file. Token usage and an approximate cost
    are printed after every batch.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) makes argparse read ``sys.argv``,
            which is the behaviour of calling ``main()`` with no arguments.

    Raises:
        SystemExit: On invalid command-line arguments (raised by argparse).
        Exception: If the output file cannot be created.
    """
    parser = argparse.ArgumentParser(description='Parse PDF pages to MD.')
    parser.add_argument('--pages', type=str, required=True, help='Specify pages to parse in format <num>-<num>')
    parser.add_argument('--output-file', type=str, default='out.md', help='Specify output file')
    parser.add_argument('--images-path', type=str, default='./.img/', help='Specify directory for extracted page images')
    parser.add_argument('--ocr', type=str, default=None, help='If present, send ocr=true to the image_to_anki method, and give the string value to the lang parameter')
    parser.add_argument('--batch-size', type=int, default=3, help='Decide how many pages are processed in parallel')
    parser.add_argument('pdf_file', type=str, help='Specify PDF file name')

    args = parser.parse_args(argv)

    # Validate before touching the filesystem: a batch size below 1 would
    # otherwise divide by zero / never advance through the page list.
    if args.batch_size < 1:
        parser.error('--batch-size must be at least 1')

    # --pages is either a single page number or an inclusive "<start>-<end>" range.
    if is_parsable_to_int(args.pages):
        start_page = end_page = int(args.pages)
    else:
        try:
            start_page, end_page = map(int, args.pages.split('-'))
        except ValueError:
            # Covers both non-numeric parts and a wrong number of parts.
            parser.error('--pages must be <num> or <num>-<num>')
    if start_page < 1 or end_page < start_page:
        parser.error('--pages must satisfy 1 <= start <= end')

    # todo: validate output file

    # Create the images directory if it doesn't exist
    os.makedirs(args.images_path, exist_ok=True)

    # Create (truncate) the output file up front so an unwritable path fails
    # before any expensive conversion or API call is made.
    try:
        with open(args.output_file, 'w', encoding='utf-8'):
            pass
    except OSError as err:
        raise Exception("Couldn't create output file") from err

    # Render the requested pages to images.
    images = convert_from_path(args.pdf_file, first_page=start_page, last_page=end_page)

    # Use only the PDF's file name for the image names; embedding the full
    # path would point into sub-directories of images_path that do not exist.
    pdf_name = os.path.basename(args.pdf_file)
    paths = []
    for page_no, image in enumerate(images, start=start_page):
        print(f'extracting image for page {page_no}...')
        image_path = os.path.join(args.images_path, f"{pdf_name}_{page_no}.png")
        image.save(image_path, 'PNG')
        paths.append(image_path)

    do_ocr = bool(args.ocr)  # OCR is enabled whenever --ocr carries a language value
    card_chunks = []
    cost = 0.0
    aborted = False

    # Walk the saved images in slices of batch_size; the last slice may be shorter.
    for batch_index, offset in enumerate(range(0, len(paths), args.batch_size)):
        to_process = paths[offset:offset + args.batch_size]

        # Retry the same batch until it succeeds or the user declines.
        while True:
            print(f'processing {len(to_process)} image{"s" if len(to_process) != 1 else ""}')
            cards, meta = dict_to_anki.image_to_anki(to_process, do_ocr=do_ocr, lang=args.ocr)
            if cards:
                break

            # The type of meta on failure is not visible here; f-string
            # formatting works whether it is a str or any other object.
            print(f"Error processing! Response: {meta}")
            if not _is_affirmative(input("Retry? [y/N] > ")):
                aborted = True
                break

        if aborted:
            # Stop processing further batches; cards collected so far are still written.
            break

        # usage logging
        # NOTE: the label says "page" but the number is the zero-based batch
        # index; the wording is kept so existing output stays unchanged.
        usage = meta['usage']
        prompt_cost = usage["prompt_tokens"] * 0.01 / 1000
        completion_cost = usage["completion_tokens"] * 0.03 / 1000
        print(f'usage for page {batch_index}:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
        print(f'approx. cost: , {prompt_cost}$ for prompt tokens (pictures approx {0.00745 * len(to_process)}$ of this), {completion_cost}$ for completion tokens')

        cost_this = prompt_cost + completion_cost
        cost += cost_this
        print(f'this page: {cost_this}$, total: {cost}$')

        card_chunks.append(cards + '\n\n\n')

    print("total cost:", cost)

    with open(args.output_file, 'w', encoding='utf-8') as file:
        file.write(''.join(card_chunks))
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|