Guest User

Untitled

a guest
Oct 2nd, 2022
35
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.01 KB | None | 0 0
import sys
from collections import Counter

from matplotlib import pyplot as plt
from PyPDF2 import PdfReader
  3. file_path = './somePdf.pdf'
  4.  
  5. reader = PdfReader(file_path, strict=False)
  6. number_of_pages = len(reader.pages)
  7.  
  8. allRawText = ""
  9. for page in reader.pages:
  10.     try:
  11.         text = page.extract_text()
  12.     except:
  13.         continue
  14.  
  15.  
  16.     allRawText+= text;
  17.  
  18. allRawText =("".join(allRawText.split('\n'))).split(' ')
  19. notRawText = []
  20.  
  21. for word in allRawText:
  22.     notRawText.append(word.strip('=-.!/_'))
  23.  
  24.  
  25. countDict = {}
  26.  
  27. s = "aNoiseSource"
  28.  
  29. w = ''
  30. l = []
  31.  
  32. d = list(filter(lambda x: len(x) > 2, notRawText))
  33.  
  34. lastList = []
  35. for word in d:
  36.     if(word[0].islower() and len(word)< 10):
  37.         lastList.append(word)
  38. for word in lastList:
  39.     countDict[word] = countDict.get(word, 0) + 1
  40.  
  41. elementAmount = 10
  42. itemsFromDict = list(sorted(countDict.items(), key=lambda x:x[1], reverse=True))[:elementAmount]
  43. x_val = list(map(lambda x:x[0], itemsFromDict))
  44. y_val = list(map(lambda x:x[1], itemsFromDict))
  45. plt.plot(x_val, y_val)
  46. plt.show()
  47.  
Advertisement
Add Comment
Please, Sign In to add comment