Advertisement
Guest User

Untitled

a guest
May 19th, 2019
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.26 KB | None | 0 0
  1. try:
  2.     from PIL import Image
  3. except ImportError:
  4.     import Image
  5. import textract
  6. import os
  7.  
  8. class PdfMiner():
  9.  
  10.     path = os.getcwd() + '/folderForPdf/'
  11.     output_path = os.getcwd() + '/output_results/'
  12.  
  13.     def __init__(self):
  14.         pass
  15.  
  16.     def main(self):
  17.         for self.filename in os.listdir(self.path):
  18.             PdfMiner().extract_text_from_pdf(self.path + self.filename)
  19.  
  20.  
  21.     def keyword_strike(self, text, keyword_strike_dict={}):
  22.         '''The purpose of keyword_strike function is
  23.        is to count how many times a specific keyword occured'''
  24.         self.keyword_strike_dict = {}
  25.         self.text = text
  26.         self.keywords_list = PdfMiner().extract_keywords()
  27.         for keyword in self.keywords_list:
  28.             if keyword in text.decode('utf-8'):
  29.                 print("Keyword {} occured {} times".format(keyword, text.decode('utf-8').count(keyword)))
  30.                 self.keyword_strike_dict[keyword] = text.decode('utf-8').count(keyword)
  31.         if bool(self.keyword_strike_dict):
  32.             print('Not empty')
  33.             print(PdfMiner().main().filename)
  34.  
  35.     def extract_keywords(self, keywords_list=None):
  36.         '''The purpose of function extract_keywords
  37.        is to extract the keywords we want to use,
  38.        from file keywords.txt, into a list'''
  39.         keywords_list = []
  40.         with open('keywords.txt', 'r', encoding='utf8') as keywords_file:
  41.             for keyword in keywords_file:
  42.                 keywords_list.append(keyword.strip('\n'))
  43.         return keywords_list
  44.  
  45.     def extract_text_from_pdf(self, file_destination, text=None):
  46.         '''The purpose of function extract_text_from_pdf
  47.        is to extract the text of each page and add it to variable text'''
  48.         self.file_destination = file_destination
  49.         text = textract.process(self.file_destination, language='eng', encoding='utf-8')
  50.         PdfMiner().keyword_strike(text)
  51.         return text
  52.  
  53.     def output_to_csv(self):
  54.         '''The purpose of function output_csv
  55.        is to create a csv file with possitive pdf filename
  56.        and inside it will contains 2 columns, first for keywords
  57.        and second column how many times the keyword appeared'''
  58.  
  59.  
  60.  
  61. if __name__ == "__main__":
  62.     PdfMiner().main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement