SHARE
TWEET

Untitled

a guest Aug 20th, 2019 96 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from io import StringIO
  2. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  3. from pdfminer.converter import TextConverter
  4. from pdfminer.layout import LAParams
  5. from pdfminer.pdfpage import PDFPage
  6. import os
  7. #import sys, getopt
  8. from pathlib import Path
  9. import re
  10. import time
  11.  
  12. def convert(fname, pages=None):
  13.     #Read a pdf, returns its text content as a string
  14.     if not pages:
  15.         pagenums = set()
  16.     else:
  17.         pagenums = set(pages)
  18.  
  19.     output = StringIO()
  20.     manager = PDFResourceManager()
  21.     converter = TextConverter(manager, output, laparams=LAParams())
  22.     interpreter = PDFPageInterpreter(manager, converter)
  23.  
  24.     infile = open(fname, 'rb')
  25.  
  26.     for page in PDFPage.get_pages(infile, pagenums,check_extractable=False):
  27.         interpreter.process_page(page)
  28.     infile.close()
  29.     converter.close()
  30.     text = output.getvalue()
  31.     output.close
  32.     return text
  33.  
  34. def convertMultiple(pdfDir, txtDir):
  35.     if pdfDir == "": pdfDir = os.getcwd() + "/"  # if no pdfDir passed in
  36.     for pdf in Path(pdfDir).glob(r'**/*.pdf'):   # iterate through pdfs in pdf directory
  37.         if '-' in str(pdf):                      #rename the file if it has "-" in the firename
  38.             pdf_legal=str(pdf).replace('-','_')
  39.             os.rename(str(pdf),str(pdf_legal))
  40.         else:
  41.             pdf_legal=pdf
  42.         fileExtension = str(pdf).split(".")[-1] #i.e. extension after the dot
  43.         if fileExtension == "pdf":
  44.             pdfFilename = str(pdf_legal)
  45.             text = convert(pdfFilename)         #get string of text content of pdf
  46.             textFilename = ((txtDir +'/'+ str(pdf).split('\\')[-1]).rstrip(".pdf"))+ ".txt"
  47.             textFile = open(textFilename, "w", encoding='utf-8')  # make text file
  48.             textFile.write(text)  # write text to text file
  49.             textFile.close()
  50. # Helpers for listing the sub-folders found at a given depth below a path (depth 0 = immediate children).
  51. def folders_in_path(path):
  52.     if not Path.is_dir(path):
  53.         raise ValueError("argument is not a directory")
  54.     yield from filter(Path.is_dir, path.iterdir())
  55.  
  56. def folders_in_depth(path, depth):
  57.     if 0 > depth:
  58.         raise ValueError("depth smaller 0")
  59.     if 0 == depth:
  60.         yield from folders_in_path(path)
  61.     else:
  62.         for folder in folders_in_path(path):
  63.             yield from folders_in_depth(folder, depth-1)
  64.  
  65. os.chdir(r"C:/Users/xxxx") #set the working directory
  66. #print(os.getcwd())
  67.  
  68. lst_folders=[]        
  69. evaluators=[]        
  70.  
  71. #get a list of all the folder names(evaluators)
  72. for folder in folders_in_depth(Path.cwd(),0):
  73.         lst_folders.append(folder)
  74.         evaluator=str(folder).split("\\")[-1]
  75.         evaluators.append(evaluator)
  76.        
  77. start_time = time.time() #start the timer at this point of time
  78. if __name__ == '__main__':
  79.     for foldername in lst_folders:                                    #iterate over each folder
  80.         evaluator=str(foldername).split("\\")[-1]
  81.         evaluator_folder_path = txtDir+'/'+evaluator
  82.        
  83.         try:                                                          #create a folder for each evaluator
  84.             os.mkdir(evaluator_folder_path)
  85.             print('folder %s has been created'%evaluator_folder_path) #mark the current folder under conversion
  86.         except OSError:  
  87.             print ("Creation of the directory %s failed" % foldername)
  88.            
  89.         convertMultiple(str(foldername), evaluator_folder_path)
  90.         print("--- %s seconds ---" % (time.time() - start_time))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top