# Untitled

from collections import Counter

from matplotlib import pyplot as plt
from PyPDF2 import PdfReader
# Path of the PDF whose word frequencies are analysed.
file_path = './somePdf.pdf'

# NOTE(review): strict=False is passed through unchanged; presumably it makes
# PyPDF2 tolerate slightly malformed files -- confirm against the PyPDF2 docs.
reader = PdfReader(file_path, strict=False)
number_of_pages = len(reader.pages)

# Collect each page's text separately and join once at the end instead of
# growing one string with += inside the loop.
page_texts = []
for page in reader.pages:
    try:
        # `or ""` guards against an extractor that yields None for a page.
        page_texts.append(page.extract_text() or "")
    except Exception:
        # Best-effort: a page whose text cannot be extracted is skipped.
        # Exception (not a bare except) is caught so that KeyboardInterrupt
        # and SystemExit still stop the script.
        continue

# Tokenise on any run of whitespace. Joining the pages with a space and
# letting split() treat newlines as separators keeps the last word of one
# line/page from being fused with the first word of the next one, and it
# never produces empty tokens.
allRawText = " ".join(page_texts).split()
# Trim the listed punctuation characters from both ends of every token
# before any length or case check is applied.
stripped_words = (word.strip('=-.!/_') for word in allRawText)

# Count how often each remaining word occurs. A word is kept only when it is
# 3 to 9 characters long and its first character is lowercase. The length
# test runs first, so word[0] is never evaluated on an empty string.
# Counter is a dict subclass, so countDict still supports .items()/.get().
countDict = Counter(
    word
    for word in stripped_words
    if 2 < len(word) < 10 and word[0].islower()
)

# Number of most frequent words shown in the plot.
elementAmount = 10

# Order the (word, count) pairs by descending count and keep the leading
# elementAmount entries.
ranked_pairs = sorted(countDict.items(), key=lambda pair: pair[1], reverse=True)
top_pairs = ranked_pairs[:elementAmount]

# Split the pairs into parallel lists: words for the x axis, counts for y.
top_words = [pair[0] for pair in top_pairs]
top_counts = [pair[1] for pair in top_pairs]

plt.plot(top_words, top_counts)
plt.show()