Advertisement
Guest User

Untitled

a guest
May 20th, 2019
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.65 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author: Abdulla Muaz
  4. @desc : Retrieving corpus, extracting words
  5. """
  6.  
  7. import re # for regular expression
  8. import PyPDF2 # to read and extract pdf files
  9. import os # to fetch all files in the directory
  10. import nltk
  11.  
  12. from nltk.tokenize import TreebankWordTokenizer
  13. tokenizer = TreebankWordTokenizer()
  14.  
  15.  
  16.  
  17. text_data = [] # creates an empty array to store all extracted text
  18.  
  19. nltk.download('punkt')
  20. # fetches all files in the corpus directory
  21. files = os.listdir(os.getcwd()+'\\corpus')
  22. files
  23.  
  24.  
  25. # function to read one pdf at a time and extract all pages to be then appended to the text_data list
  26. def extractmyfiles(pdf):
  27. pdf = os.getcwd()+'\\corpus\\'+pdf
  28. with open(pdf, 'rb') as pdf_file:
  29. read_pdf = PyPDF2.PdfFileReader(pdf_file)
  30. number_of_pages = read_pdf.getNumPages()
  31. for page_number in range(number_of_pages):
  32. page = read_pdf.getPage(page_number)
  33. text_data.append(page.extractText())
  34. return
  35.  
  36.  
  37. # for every file in the files list, run the function with a parameter passed as file name
  38. for file in files:
  39. extractmyfiles(file)
  40.  
  41. # create empty array to store all the words
  42. word = []
  43.  
  44. for i in text_data:
  45. # ignores words that cause issue eg. \asas\aeae\asas
  46. i = i.encode('ascii', 'ignore')
  47. # split the words (extract only words using the regex pattern)
  48. words = re.findall(b"([a-zA-Z]+)", i)
  49. for w in words:
  50. word.append(w)
  51.  
  52.  
  53. print(len(word))
  54.  
  55. # wrtie to csv all the words
  56. with open('test.csv', 'w') as out_f:
  57. for l in word:
  58. out_f.write("\n"+l.decode("utf-8"))
  59.  
  60.  
  61. # plot the frequency distribution
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement