Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- try:
- from PIL import Image
- except ImportError:
- import Image
- import textract
- import os
class PdfMiner:
    '''Scan the documents in a folder and report keyword occurrences.

    Every file found in ``path`` is converted to text with ``textract`` and
    the text is searched for the keywords listed in ``keywords.txt``.
    '''

    # Input folder, resolved against the working directory at class-definition time.
    path = os.getcwd() + '/folderForPdf/'
    # Folder reserved for result files; nothing in this class writes to it yet.
    output_path = os.getcwd() + '/output_results/'

    def __init__(self):
        pass

    def main(self):
        '''Extract the text of every file in ``path`` and scan it for keywords.

        Raises whatever ``os.listdir`` raises when ``path`` does not exist.
        '''
        for entry in os.listdir(self.path):
            file_destination = os.path.join(self.path, entry)
            # Sub-directories cannot be handed to textract; skip them.
            if not os.path.isfile(file_destination):
                continue
            # Reuse this instance so the current filename stays available to
            # keyword_strike instead of being lost on a throwaway object.
            self.extract_text_from_pdf(file_destination)

    def keyword_strike(self, text, keyword_strike_dict=None):
        '''Count how many times each keyword occurs in ``text``.

        The counts of the keywords that occur at least once are stored in
        ``self.keyword_strike_dict`` and printed. When there is at least one
        hit, ``Not empty`` and the current filename (if known) are printed
        as well.

        Args:
            text: Document text, either UTF-8 encoded bytes (what
                ``textract.process`` returns) or an already decoded ``str``.
            keyword_strike_dict: Accepted for backward compatibility and
                ignored; a fresh dictionary is built on every call.
        '''
        self.keyword_strike_dict = {}
        self.text = text
        self.keywords_list = self.extract_keywords()
        # Decode once rather than once per keyword lookup.
        if isinstance(text, (bytes, bytearray)):
            decoded = text.decode('utf-8')
        else:
            decoded = text
        for keyword in self.keywords_list:
            occurrences = decoded.count(keyword)
            if occurrences:
                print("Keyword {} occured {} times".format(keyword, occurrences))
                self.keyword_strike_dict[keyword] = occurrences
        if self.keyword_strike_dict:
            print('Not empty')
            # Report the file being processed. Calling main() here, as the
            # code used to, re-scanned the whole folder recursively and then
            # failed on the None it returns.
            filename = getattr(self, 'filename', None)
            if filename is not None:
                print(filename)

    def extract_keywords(self, keywords_list=None):
        '''Read the keywords to search for from ``keywords.txt``.

        The file is looked up in the current working directory and holds one
        keyword per line. Blank lines are skipped, because an empty keyword
        would match every document.

        Args:
            keywords_list: Accepted for backward compatibility and ignored.

        Returns:
            The keywords as a list of strings, in file order.
        '''
        with open('keywords.txt', 'r', encoding='utf8') as keywords_file:
            stripped_lines = (line.strip('\n') for line in keywords_file)
            return [keyword for keyword in stripped_lines if keyword.strip()]

    def extract_text_from_pdf(self, file_destination, text=None):
        '''Extract the text of one document and scan it for keywords.

        Args:
            file_destination: Path of the document to process.
            text: Accepted for backward compatibility and ignored.

        Returns:
            The extracted text as returned by ``textract.process``.
        '''
        self.file_destination = file_destination
        # Remember the bare filename so keyword_strike can report it.
        self.filename = os.path.basename(file_destination)
        text = textract.process(self.file_destination, language='eng', encoding='utf-8')
        self.keyword_strike(text)
        return text

    def output_to_csv(self):
        '''Placeholder for the CSV export; it currently does nothing.

        Intended behaviour: for each document with at least one keyword hit,
        create a CSV file named after the document with two columns, the
        keyword and how many times it appeared.
        '''
if __name__ == "__main__":
    # Script entry point: build a miner and process the input folder.
    miner = PdfMiner()
    miner.main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement