Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from collections import Counter

from matplotlib import pyplot as plt
from PyPDF2 import PdfReader
# Default input document and the punctuation trimmed from both ends of a token.
PDF_PATH = './somePdf.pdf'
STRIP_CHARS = '=-.!/_'


def read_pdf_text(path):
    """Return the text of every readable page of the PDF at *path*.

    Pages whose text extraction raises are skipped (best effort). Page texts
    are joined with a newline so the last word of one page is never glued to
    the first word of the next.
    """
    reader = PdfReader(path, strict=False)
    page_texts = []
    for page in reader.pages:
        try:
            page_texts.append(page.extract_text())
        except Exception:
            # Best effort: one unreadable page should not abort the whole
            # run. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
    return "\n".join(page_texts)


def extract_words(raw_text, strip_chars=STRIP_CHARS):
    """Split *raw_text* into tokens and trim *strip_chars* from both ends.

    Splitting happens on any run of whitespace, so words separated only by a
    line break stay separate and repeated spaces produce no empty tokens.
    Tokens made up solely of *strip_chars* come back as empty strings.
    """
    return [token.strip(strip_chars) for token in raw_text.split()]


def select_words(words, min_length=3, max_length=9):
    """Keep words that start with a lowercase letter and fit the length bounds.

    Both bounds are inclusive. The slice `word[:1]` keeps the check safe for
    empty strings, which are simply rejected.
    """
    return [
        word
        for word in words
        if min_length <= len(word) <= max_length and word[:1].islower()
    ]


def top_words(raw_text, limit=10):
    """Return the *limit* most frequent selected words as (word, count) pairs.

    Pairs are ordered by descending count; words with equal counts keep the
    order in which they first appear in *raw_text*.
    """
    frequencies = Counter(select_words(extract_words(raw_text)))
    return frequencies.most_common(limit)


def main(path=PDF_PATH, limit=10):
    """Plot the most frequent words of the PDF at *path* and show the figure."""
    ranked = top_words(read_pdf_text(path), limit)
    labels = [word for word, _ in ranked]
    totals = [count for _, count in ranked]
    plt.plot(labels, totals)
    plt.show()


# Run only when executed as a script, so importing this module has no side
# effects (no file access, no plot window).
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment