Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from io import StringIO
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import TextConverter
- from pdfminer.layout import LAParams
- from pdfminer.pdfpage import PDFPage
- import os
- #import sys, getopt
- from pathlib import Path
- import re
- import time
def convert(fname, pages=None):
    """Read a PDF file and return its text content as a string.

    Args:
        fname: Path of the PDF file to open (opened in binary mode).
        pages: Optional iterable of page numbers to restrict extraction to.
            When falsy (``None`` or empty) an empty set is handed to
            ``PDFPage.get_pages``.
            NOTE(review): presumably pdfminer treats an empty set as
            "all pages" and the numbers are zero-based; confirm against the
            installed pdfminer version.

    Returns:
        The text produced by pdfminer's ``TextConverter`` for the processed
        pages.
    """
    pagenums = set(pages) if pages else set()

    # Context managers / finally guarantee that the buffer, the converter and
    # the input file are released even when pdfminer raises mid-document.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums,
                                              check_extractable=False):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter is closed, as before.
        return output.getvalue()
def convertMultiple(pdfDir, txtDir):
    """Convert every ``*.pdf`` found under *pdfDir* into a text file in *txtDir*.

    The search is recursive (``**/*.pdf``). A PDF whose file name contains
    ``-`` is first renamed on disk with ``_`` in place of each dash; the text
    file keeps the PDF's original base name with ``.txt`` as its extension.
    All text files are written directly into *txtDir*, so PDFs that share a
    base name in different sub-directories overwrite each other's output.

    Args:
        pdfDir: Directory to search. An empty string means the current
            working directory.
        txtDir: Existing directory that receives the ``.txt`` files.
    """
    if pdfDir == "":
        pdfDir = os.getcwd() + "/"  # if no pdfDir passed in

    # Snapshot the matches before touching the disk: files are renamed inside
    # the loop, and a live directory scan could otherwise revisit them.
    for pdf in list(Path(pdfDir).glob(r'**/*.pdf')):
        if not pdf.is_file():
            continue  # a directory that merely happens to be named *.pdf

        if '-' in pdf.name:
            # Replace dashes in the file name only; rewriting the whole path
            # would point the rename at directories that do not exist.
            pdf_legal = pdf.with_name(pdf.name.replace('-', '_'))
            os.rename(str(pdf), str(pdf_legal))
        else:
            pdf_legal = pdf

        # The glob can match case-insensitively on some platforms; keep the
        # original strict check on the lower-case extension.
        if pdf.suffix != ".pdf":
            continue

        text = convert(str(pdf_legal))  # get string of text content of pdf

        # ``stem`` drops exactly one ".pdf" suffix; ``rstrip(".pdf")`` would
        # also eat trailing 'p', 'd', 'f' and '.' characters of the name.
        textFilename = str(Path(txtDir) / (pdf.stem + ".txt"))
        with open(textFilename, "w", encoding='utf-8') as textFile:
            textFile.write(text)
- # walk through all the sub-folders with specified level(depth) to look through
def folders_in_path(path):
    """Yield each immediate sub-directory of *path*.

    *path* is expected to be a ``pathlib.Path``. Because this is a generator,
    the ``ValueError`` for a non-directory argument is raised when iteration
    starts, not when the function is called.
    """
    if not path.is_dir():
        raise ValueError("argument is not a directory")
    for entry in path.iterdir():
        if entry.is_dir():
            yield entry
def folders_in_depth(path, depth):
    """Yield the directories located exactly *depth* levels below *path*'s children.

    ``depth == 0`` yields the immediate sub-directories of *path*; each extra
    level descends one directory further. A negative *depth* raises
    ``ValueError`` once iteration starts (this is a generator).
    """
    if depth < 0:
        raise ValueError("depth smaller 0")
    if depth == 0:
        yield from folders_in_path(path)
        return
    for child in folders_in_path(path):
        yield from folders_in_depth(child, depth - 1)
os.chdir(r"C:/Users/xxxx")  # set the working directory

# Root directory that receives one sub-folder of .txt files per evaluator.
# NOTE(review): the script used ``txtDir`` without ever defining it, which
# raises NameError. This path is a placeholder; set it to the real output
# location, preferably outside the working directory so it is not picked up
# as an evaluator folder on a later run.
txtDir = r"C:/Users/xxxx_txt"

lst_folders = []
evaluators = []
# Collect the immediate sub-folders of the working directory; each folder
# name is treated as an evaluator name.
for folder in folders_in_depth(Path.cwd(), 0):
    lst_folders.append(folder)
    # ``Path.name`` is the last path component on every platform, unlike
    # splitting the string on a backslash.
    evaluators.append(folder.name)

start_time = time.time()  # start the timer at this point of time

if __name__ == '__main__':
    # The per-evaluator ``os.mkdir`` below cannot succeed without its parent.
    os.makedirs(txtDir, exist_ok=True)

    for foldername, evaluator in zip(lst_folders, evaluators):
        evaluator_folder_path = txtDir + '/' + evaluator
        try:  # create a folder for each evaluator
            os.mkdir(evaluator_folder_path)
            # mark the current folder under conversion
            print('folder %s has been created' % evaluator_folder_path)
        except OSError:
            # Best effort: the folder may already exist from an earlier run,
            # so report the failure and still attempt the conversion.
            print("Creation of the directory %s failed" % evaluator_folder_path)
        convertMultiple(str(foldername), evaluator_folder_path)

    print("--- %s seconds ---" % (time.time() - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement