anki-from-dictionary/main.py

118 lines
4.1 KiB
Python
Raw Permalink Normal View History

2024-02-01 11:56:34 +00:00
import argparse
import os
from pdf2image import convert_from_path
import dict_to_anki # ensure dict_to_anki is imported
# NOTE(review): this constant is not referenced anywhere in the visible part of
# this file; batching in main() is driven by the --batch-size option instead.
# Confirm whether another module still relies on it before removing.
IMGS_PER_REQUEST = 2
def is_parsable_to_int(input_string):
    """Return True if ``int(input_string)`` succeeds, otherwise False.

    Args:
        input_string: The value to probe, normally a string such as ``"12"``.

    Returns:
        bool: True when the value converts to an integer. False for
        non-numeric text (e.g. ``"3-5"``, ``""``) and for values ``int()``
        rejects outright, such as ``None``.
    """
    try:
        int(input_string)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer literal.
        # TypeError: a type int() cannot convert at all (None, list, ...).
        return False
    return True
def _parse_page_range(pages):
    """Convert a ``--pages`` value into an inclusive ``(start, end)`` tuple.

    Accepts a single page number (``"7"``) or a range (``"3-9"``).

    Raises:
        ValueError: If the value is in neither form, or the range does not
            describe ascending, positive page numbers.
    """
    if is_parsable_to_int(pages):
        start_page = end_page = int(pages)
    else:
        # Unpacking raises ValueError unless there are exactly two parts;
        # int() raises ValueError when a part is not a number.
        start_page, end_page = map(int, pages.split('-'))
    if start_page < 1 or end_page < start_page:
        raise ValueError(f'invalid page range: {pages!r}')
    return start_page, end_page


def _is_affirmative(response):
    """Return True when *response* is a yes answer ("y" or "yes", any case)."""
    return response.strip().lower() in ('y', 'yes')


def main():
    """Command-line entry point: render PDF pages to images and convert them.

    Steps:
      1. Parse the command line (page range, output file, image directory,
         optional OCR languages, batch size, PDF file).
      2. Render the requested pages with ``convert_from_path`` and save each
         one as a PNG inside the image directory.
      3. Pass the images to ``dict_to_anki.image_to_anki`` in batches of
         ``--batch-size``. When a batch yields no cards the user is asked
         whether to retry; declining stops all further processing.
      4. Print token usage and an approximate cost per batch, then write the
         collected cards to the output file.

    Raises:
        Exception: If the output file cannot be created.
        SystemExit: Via argparse, when the arguments are invalid.
    """
    parser = argparse.ArgumentParser(description='Parse PDF pages to MD.')
    parser.add_argument(
        '--pages', type=str, required=True,
        help='Specify pages to parse in format <num>-<num>',
    )
    parser.add_argument(
        '--output-file', type=str, default='out.md',
        help='Specify output file',
    )
    parser.add_argument(
        '--images-path', type=str, default='./.img/',
        help='Specify directory for the extracted page images',
    )
    parser.add_argument(
        '--ocr', type=str, default=None,
        help='languages to use for local OCR, e.g. deu+chi_sim for german and simplified chinese (tesseract langpacks needed)',
    )
    parser.add_argument(
        '--batch-size', type=int, default=3,
        help='Decide how many pages are processed in parallel',
    )
    parser.add_argument('pdf_file', type=str, help='Specify PDF file name')
    args = parser.parse_args()

    # Report malformed input as a usage error instead of a raw traceback.
    try:
        start_page, end_page = _parse_page_range(args.pages)
    except ValueError:
        parser.error(f'--pages must be <num> or <num>-<num>, got {args.pages!r}')
    if args.batch_size < 1:
        parser.error('--batch-size must be at least 1')

    # Create the images directory if it doesn't exist.
    os.makedirs(args.images_path, exist_ok=True)

    # Fail early (before any expensive processing) if the output file cannot
    # be created. UTF-8 is explicit because the cards may contain non-ASCII
    # text regardless of the platform's default encoding.
    try:
        with open(args.output_file, 'w', encoding='utf-8'):
            pass
    except OSError as err:
        raise Exception("Couldn't create output file") from err

    # Convert the requested PDF pages to images.
    images = convert_from_path(args.pdf_file, first_page=start_page, last_page=end_page)

    # Use only the PDF's base name so a PDF given with a directory component
    # does not produce an image path pointing into a non-existent subfolder.
    pdf_name = os.path.basename(args.pdf_file)
    paths = []
    for page_number, image in enumerate(images, start=start_page):
        print(f'extracting image for page {page_number}...')
        image_path = os.path.join(args.images_path, f'{pdf_name}_{page_number}.png')
        image.save(image_path, 'PNG')
        paths.append(image_path)

    use_ocr = bool(args.ocr)  # OCR is enabled whenever --ocr is given
    collected = []
    cost = 0.0
    aborted = False
    for batch_index, offset in enumerate(range(0, len(paths), args.batch_size)):
        to_process = paths[offset:offset + args.batch_size]

        # Keep trying this batch until it succeeds or the user gives up.
        while True:
            print(f'processing {len(to_process)} image{"s" if len(to_process) != 1 else ""}')
            cards, meta = dict_to_anki.image_to_anki(to_process, do_ocr=use_ocr, lang=args.ocr)
            if cards:
                break
            print("Error processing! Response: " + str(meta))
            if not _is_affirmative(input("Retry? [y/N] > ")):
                aborted = True
                break
        if aborted:
            # Stop entirely, but still write what was collected so far.
            break

        # Usage logging. The per-1000-token rates are hard-coded here.
        usage = meta['usage']
        prompt_cost = usage["prompt_tokens"] * 0.01 / 1000
        completion_cost = usage["completion_tokens"] * 0.03 / 1000
        print(f'usage for page {batch_index}:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
        print(
            f'approx. cost: , {prompt_cost}$ for prompt tokens '
            f'(pictures approx {0.00745 * len(to_process)}$ of this), '
            f'{completion_cost}$ for completion tokens'
        )
        cost_this = prompt_cost + completion_cost
        cost += cost_this
        print(f'this page: {cost_this}$, total: {cost}$')
        collected.append(cards)

    print("total cost:", cost)
    with open(args.output_file, 'w', encoding='utf-8') as file:
        file.write(''.join(cards + '\n\n\n' for cards in collected))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()