# 2024-02-01 11:56:34 +00:00
import argparse
import os
from pdf2image import convert_from_path
import dict_to_anki # ensure dict_to_anki is imported
IMGS_PER_REQUEST = 2
def is_parsable_to_int(input_string):
    """Report whether ``int(input_string)`` would succeed.

    Args:
        input_string: The value to probe, normally a command-line string.

    Returns:
        True if ``int()`` accepts the value, False otherwise.  Values of an
        unsupported type (for example ``None``) yield False instead of
        letting the ``TypeError`` escape, so the function is a pure predicate.
    """
    try:
        int(input_string)
    except (TypeError, ValueError):
        return False
    return True
def _parse_page_range(pages):
    """Turn a ``--pages`` value into an inclusive ``(start, end)`` tuple.

    Accepts a single page number ("7") or a range ("3-9").

    Raises:
        ValueError: If the value has neither form, or if it describes a
            range that starts below page 1 or ends before it starts.
    """
    if is_parsable_to_int(pages):
        start = end = int(pages)
    else:
        # Unpacking fails with ValueError for anything but exactly two parts.
        start, end = map(int, pages.split('-'))
    if start < 1 or end < start:
        raise ValueError(f'invalid page range: {pages!r}')
    return start, end


def _is_affirmative(answer):
    """Return True when *answer* is "y" or "yes", ignoring case and padding."""
    return answer.strip().lower() in ('y', 'yes')


def main():
    """Convert a page range of a PDF into cards written to an output file.

    Steps: parse the command line, render the requested pages to PNG files
    in the images directory, send those files in batches of ``--batch-size``
    to ``dict_to_anki.image_to_anki``, print token usage and an approximate
    cost per batch, and finally write all returned card text to the output
    file.

    If a batch yields no cards the user is asked whether to retry; declining
    stops processing, and whatever was collected so far is still written.

    Raises:
        SystemExit: Via argparse, for an invalid ``--pages`` or
            ``--batch-size`` value.
        Exception: If the output file cannot be opened for writing.
    """
    parser = argparse.ArgumentParser(description='Parse PDF pages to MD.')
    parser.add_argument('--pages', type=str, required=True,
                        help='Specify pages to parse in format <num>-<num>')
    parser.add_argument('--output-file', type=str, default='out.md',
                        help='Specify output file')
    parser.add_argument('--images-path', type=str, default='./.img/',
                        help='Specify directory for the extracted page images')
    parser.add_argument('--ocr', type=str, default=None,
                        help='languages to use for local OCR, e.g. deu+chi_sim '
                             'for german and simplified chinese '
                             '(tesseract langpacks needed)')
    parser.add_argument('--batch-size', type=int, default=3,
                        help='Decide how many pages are processed in parallel')
    parser.add_argument('pdf_file', type=str, help='Specify PDF file name')
    args = parser.parse_args()

    try:
        start_page, end_page = _parse_page_range(args.pages)
    except ValueError:
        parser.error('--pages must be <num> or <num>-<num> with 1 <= start <= end')
    if args.batch_size < 1:
        # A zero or negative batch size cannot partition the page list.
        parser.error('--batch-size must be at least 1')

    # Create the images directory if it doesn't exist.
    os.makedirs(args.images_path, exist_ok=True)

    # Fail early if the output file is not writable.  Append mode creates a
    # missing file but leaves existing content alone until results are ready.
    try:
        with open(args.output_file, 'a', encoding='utf-8'):
            pass
    except OSError as err:
        raise Exception("Couldn't create output file") from err

    # Render the requested pages to images.
    images = convert_from_path(args.pdf_file, first_page=start_page, last_page=end_page)

    # Only the file name goes into the image name; a directory component of
    # pdf_file would otherwise point into a folder that was never created.
    pdf_name = os.path.basename(args.pdf_file)
    paths = []
    for page_number, image in enumerate(images, start=start_page):
        print(f'extracting image for page {page_number}...')
        image_path = os.path.join(args.images_path, f'{pdf_name}_{page_number}.png')
        image.save(image_path, 'PNG')
        paths.append(image_path)

    do_ocr = bool(args.ocr)  # OCR is enabled whenever --ocr names languages
    card_chunks = []
    cost = 0.0
    aborted = False
    batch_starts = range(0, len(paths), args.batch_size)
    for batch_number, offset in enumerate(batch_starts, start=1):
        to_process = paths[offset:offset + args.batch_size]

        # Retry the same batch until it yields cards or the user gives up.
        while True:
            print(f'processing {len(to_process)} image{"s" if len(to_process) != 1 else ""}')
            cards, meta = dict_to_anki.image_to_anki(to_process, do_ocr=do_ocr, lang=args.ocr)
            if cards:
                break
            print("Error processing! Response: " + str(meta))
            if not _is_affirmative(input("Retry? [y/N] > ")):
                aborted = True
                break
        if aborted:
            break

        # Usage logging; the rates below are the hard-coded per-1000-token
        # prices this script has always used for its cost estimate.
        usage = meta['usage']
        prompt_cost = usage["prompt_tokens"] * 0.01 / 1000
        completion_cost = usage["completion_tokens"] * 0.03 / 1000
        print(f'usage for batch {batch_number}:\n{usage["prompt_tokens"]} prompt tokens '
              f'and {usage["completion_tokens"]} completion tokens')
        print(f'approx. cost: , {prompt_cost}$ for prompt tokens '
              f'(pictures approx {0.00745 * len(to_process)}$ of this), '
              f'{completion_cost}$ for completion tokens')
        cost_this = prompt_cost + completion_cost
        cost += cost_this
        print(f'this batch: {cost_this}$, total: {cost}$')
        card_chunks.append(cards + '\n\n\n')

    print("total cost:", cost)
    # UTF-8 keeps non-ASCII card text writable regardless of the locale.
    with open(args.output_file, 'w', encoding='utf-8') as file:
        file.write(''.join(card_chunks))
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()