Advertisement
Guest User

Untitled

a guest
May 19th, 2019
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.19 KB | None | 0 0
  1. import textract
  2. import os
  3.  
  4. class PdfMiner():
  5.  
  6.     path = os.getcwd() + '/folderForPdf/'
  7.     output_path = os.getcwd() + '/output_results/'
  8.  
  9.     def __init__(self):
  10.         pass
  11.  
  12.     def main(self):
  13.         for self.filename in os.listdir(self.path):
  14.             PdfMiner().extract_text_from_pdf(self.path + self.filename)
  15.  
  16.  
  17.     def keyword_strike(self, text, keyword_strike_dict={}):
  18.         '''The purpose of keyword_strike function is
  19.        is to count how many times a specific keyword occured'''
  20.         self.keyword_strike_dict = {}
  21.         self.text = text
  22.         self.keywords_list = PdfMiner().extract_keywords()
  23.         for keyword in self.keywords_list:
  24.             if keyword in text.decode('utf-8'):
  25.                 print("Keyword {} occured {} times".format(keyword, text.decode('utf-8').count(keyword)))
  26.                 self.keyword_strike_dict[keyword] = text.decode('utf-8').count(keyword)
  27.         if bool(self.keyword_strike_dict):
  28.             print('Not empty')
  29.             print(PdfMiner().main().filename)
  30.  
  31.     def extract_keywords(self, keywords_list=None):
  32.         '''The purpose of function extract_keywords
  33.        is to extract the keywords we want to use,
  34.        from file keywords.txt, into a list'''
  35.         keywords_list = []
  36.         with open('keywords.txt', 'r', encoding='utf8') as keywords_file:
  37.             for keyword in keywords_file:
  38.                 keywords_list.append(keyword.strip('\n'))
  39.         return keywords_list
  40.  
  41.     def extract_text_from_pdf(self, file_destination, text=None):
  42.         '''The purpose of function extract_text_from_pdf
  43.        is to extract the text of each page and add it to variable text'''
  44.         self.file_destination = file_destination
  45.         text = textract.process(self.file_destination, language='eng', encoding='utf-8')
  46.         PdfMiner().keyword_strike(text)
  47.         return text
  48.  
  49.     def output_to_csv(self):
  50.         '''The purpose of function output_csv
  51.        is to create a csv file with possitive pdf filename
  52.        and inside it will contains 2 columns, first for keywords
  53.        and second column how many times the keyword appeared'''
  54.  
  55.  
  56.  
  57. if __name__ == "__main__":
  58.     PdfMiner().main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement