import argparse
import os
from pdf2image import convert_from_path
import dict_to_anki  # ensure dict_to_anki is imported

IMGS_PER_REQUEST = 2


def is_parsable_to_int(input_string):
    """Return True if ``int(input_string)`` succeeds, otherwise False."""
    try:
        int(input_string)
    except (TypeError, ValueError):
        return False
    return True


def _parse_page_range(pages):
    """Turn a ``--pages`` value into a ``(start_page, end_page)`` tuple.

    Accepts either a single page number (``"7"``) or an inclusive range
    (``"3-9"``).

    Raises:
        ValueError: if the value is not in one of those two forms, or if the
            resulting range is empty or starts below page 1.
    """
    if is_parsable_to_int(pages):
        start_page = end_page = int(pages)
    else:
        try:
            start_page, end_page = map(int, pages.split('-'))
        except ValueError:
            # Covers non-numeric parts as well as too few / too many parts.
            raise ValueError(
                f"invalid page specification {pages!r}; expected N or START-END"
            ) from None
    if start_page < 1 or end_page < start_page:
        raise ValueError(
            f"invalid page range {pages!r}; need 1 <= START <= END"
        )
    return start_page, end_page


def _is_affirmative(response):
    """Return True if the user's answer to a [y/N] prompt means "yes"."""
    return response.strip().lower() in {'y', 'yes'}


def main():
    """Command-line entry point.

    Renders the requested pages of a PDF to PNG files, passes them in batches
    of ``--batch-size`` to ``dict_to_anki.image_to_anki``, prints the token
    usage and approximate cost of every batch, and writes the concatenated
    results to ``--output-file``.

    If a batch yields no cards the user is asked whether to retry; declining
    stops processing, and whatever was collected so far is still written.

    Raises:
        Exception: if the output file cannot be created or opened for writing.
        SystemExit: via ``argparse`` on invalid command-line arguments.
    """
    parser = argparse.ArgumentParser(description='Parse PDF pages to MD.')
    parser.add_argument('--pages', type=str, required=True,
                        help='Specify pages to parse: a single page N or a range START-END')
    parser.add_argument('--output-file', type=str, default='out.md',
                        help='Specify output file')
    parser.add_argument('--images-path', type=str, default='./.img/',
                        help='Specify directory for the extracted page images')
    parser.add_argument('--ocr', type=str, default=None,
                        help='languages to use for local OCR, e.g. deu+chi_sim for german and simplified chinese (tesseract langpacks needed)')
    parser.add_argument('--batch-size', type=int, default=3,
                        help='Decide how many pages are processed in parallel')
    parser.add_argument('pdf_file', type=str, help='Specify PDF file name')
    args = parser.parse_args()

    try:
        start_page, end_page = _parse_page_range(args.pages)
    except ValueError as err:
        parser.error(str(err))

    # A batch size below 1 would make the batching loop meaningless.
    if args.batch_size < 1:
        parser.error('--batch-size must be at least 1')

    # Create the images directory if it doesn't exist
    os.makedirs(args.images_path, exist_ok=True)

    # Fail early if the output file is not writable. Append mode creates the
    # file when missing but does not truncate an existing one, so previous
    # contents survive if a later step fails.
    try:
        with open(args.output_file, 'a', encoding='utf-8'):
            pass
    except OSError as err:
        raise Exception("Couldn't create output file") from err

    # convert PDF to images for the given pages
    images = convert_from_path(args.pdf_file, first_page=start_page, last_page=end_page)

    # Use only the PDF's base name so a PDF given with a directory component
    # does not point into a non-existent sub-directory of the images path.
    pdf_name = os.path.basename(args.pdf_file)
    paths = []
    for page_number, image in enumerate(images, start=start_page):
        print(f'extracting image for page {page_number}...')
        image_path = os.path.join(args.images_path, f"{pdf_name}_{page_number}.png")
        image.save(image_path, 'PNG')
        paths.append(image_path)

    ocr = bool(args.ocr)  # OCR is enabled whenever --ocr names a language
    sections = []
    cost = 0.0
    aborted = False

    # i is the zero-based batch index (the log messages call it "page").
    for i, offset in enumerate(range(0, len(paths), args.batch_size)):
        to_process = paths[offset:offset + args.batch_size]

        # Keep trying this batch until it succeeds or the user gives up.
        while True:
            print(f'processing {len(to_process)} image{"s" if len(to_process) != 1 else ""}')
            cards, meta = dict_to_anki.image_to_anki(to_process, do_ocr=ocr, lang=args.ocr)
            if cards:
                break
            print("Error processing! Response: " + str(meta))
            if not _is_affirmative(input("Retry? [y/N] > ")):
                aborted = True
                break
        if aborted:
            break

        # usage logging
        usage = meta['usage']
        prompt_cost = usage["prompt_tokens"] * 0.01 / 1000
        completion_cost = usage["completion_tokens"] * 0.03 / 1000
        print(f'usage for page {i}:\n{usage["prompt_tokens"]} prompt tokens and {usage["completion_tokens"]} completion tokens')
        print(f'approx. cost: , {prompt_cost}$ for prompt tokens (pictures approx {0.00745 * len(to_process)}$ of this), {completion_cost}$ for completion tokens')
        cost_this = prompt_cost + completion_cost
        cost += cost_this
        print(f'this page: {cost_this}$, total: {cost}$')
        sections.append(cards + '\n\n\n')

    print("total cost:", cost)
    with open(args.output_file, 'w', encoding='utf-8') as file:
        file.write(''.join(sections))


if __name__ == "__main__":
    main()