Advertisement
Guest User

Untitled

a guest
Aug 20th, 2019
141
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.40 KB | None | 0 0
  1. from io import StringIO
  2. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  3. from pdfminer.converter import TextConverter
  4. from pdfminer.layout import LAParams
  5. from pdfminer.pdfpage import PDFPage
  6. import os
  7. #import sys, getopt
  8. from pathlib import Path
  9. import re
  10. import time
  11.  
  12. def convert(fname, pages=None):
  13. #Read a pdf, returns its text content as a string
  14. if not pages:
  15. pagenums = set()
  16. else:
  17. pagenums = set(pages)
  18.  
  19. output = StringIO()
  20. manager = PDFResourceManager()
  21. converter = TextConverter(manager, output, laparams=LAParams())
  22. interpreter = PDFPageInterpreter(manager, converter)
  23.  
  24. infile = open(fname, 'rb')
  25.  
  26. for page in PDFPage.get_pages(infile, pagenums,check_extractable=False):
  27. interpreter.process_page(page)
  28. infile.close()
  29. converter.close()
  30. text = output.getvalue()
  31. output.close
  32. return text
  33.  
  34. def convertMultiple(pdfDir, txtDir):
  35. if pdfDir == "": pdfDir = os.getcwd() + "/" # if no pdfDir passed in
  36. for pdf in Path(pdfDir).glob(r'**/*.pdf'): # iterate through pdfs in pdf directory
  37. if '-' in str(pdf): #rename the file if it has "-" in the firename
  38. pdf_legal=str(pdf).replace('-','_')
  39. os.rename(str(pdf),str(pdf_legal))
  40. else:
  41. pdf_legal=pdf
  42. fileExtension = str(pdf).split(".")[-1] #i.e. extension after the dot
  43. if fileExtension == "pdf":
  44. pdfFilename = str(pdf_legal)
  45. text = convert(pdfFilename) #get string of text content of pdf
  46. textFilename = ((txtDir +'/'+ str(pdf).split('\\')[-1]).rstrip(".pdf"))+ ".txt"
  47. textFile = open(textFilename, "w", encoding='utf-8') # make text file
  48. textFile.write(text) # write text to text file
  49. textFile.close()
  50. # walk through all the sub-folders with specified level(depth) to look through
  51. def folders_in_path(path):
  52. if not Path.is_dir(path):
  53. raise ValueError("argument is not a directory")
  54. yield from filter(Path.is_dir, path.iterdir())
  55.  
  56. def folders_in_depth(path, depth):
  57. if 0 > depth:
  58. raise ValueError("depth smaller 0")
  59. if 0 == depth:
  60. yield from folders_in_path(path)
  61. else:
  62. for folder in folders_in_path(path):
  63. yield from folders_in_depth(folder, depth-1)
  64.  
  65. os.chdir(r"C:/Users/xxxx") #set the working directory
  66. #print(os.getcwd())
  67.  
  68. lst_folders=[]
  69. evaluators=[]
  70.  
  71. #get a list of all the folder names(evaluators)
  72. for folder in folders_in_depth(Path.cwd(),0):
  73. lst_folders.append(folder)
  74. evaluator=str(folder).split("\\")[-1]
  75. evaluators.append(evaluator)
  76.  
  77. start_time = time.time() #start the timer at this point of time
  78. if __name__ == '__main__':
  79. for foldername in lst_folders: #iterate over each folder
  80. evaluator=str(foldername).split("\\")[-1]
  81. evaluator_folder_path = txtDir+'/'+evaluator
  82.  
  83. try: #create a folder for each evaluator
  84. os.mkdir(evaluator_folder_path)
  85. print('folder %s has been created'%evaluator_folder_path) #mark the current folder under conversion
  86. except OSError:
  87. print ("Creation of the directory %s failed" % foldername)
  88.  
  89. convertMultiple(str(foldername), evaluator_folder_path)
  90. print("--- %s seconds ---" % (time.time() - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement