Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- @author: Abdulla Muaz
- @desc : Retrieving corpus, extracting words
- """
- import re # for regular expression
- import PyPDF2 # to read and extract pdf files
- import os # to fetch all files in the directory
- import nltk
- from nltk.tokenize import TreebankWordTokenizer
# Tokenizer instance for NLTK word tokenization.
# NOTE(review): unused in the visible extraction code -- presumably used
# further down the file (e.g. for the frequency plot); confirm before removing.
tokenizer = TreebankWordTokenizer()

# Accumulates the raw text of every page of every PDF in the corpus.
text_data = []

# Make sure the punkt tokenizer models are available for nltk.
nltk.download('punkt')

# List every file in the 'corpus' subdirectory of the working directory.
# os.path.join is portable; the old hard-coded '\\' only worked on Windows.
# (The stray bare `files` expression that followed was REPL residue and
# had no effect in a script, so it has been dropped.)
files = os.listdir(os.path.join(os.getcwd(), 'corpus'))
# Read one PDF at a time and append the text of each of its pages to the
# module-level text_data list.
def extractmyfiles(pdf):
    """Extract every page of the corpus PDF named *pdf* into ``text_data``.

    Parameters
    ----------
    pdf : str
        Bare file name of a PDF located in the 'corpus' subdirectory of
        the current working directory.

    Side effects
    ------------
    Appends one string per page to the global ``text_data`` list.
    """
    # Build the full path portably; the old '+\\' concatenation was
    # Windows-only.
    path = os.path.join(os.getcwd(), 'corpus', pdf)
    with open(path, 'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        # NOTE(review): PdfFileReader / getNumPages / getPage / extractText
        # are the legacy PyPDF2 1.x API; PyPDF2 3.x renamed them to
        # PdfReader, len(reader.pages), and page.extract_text() -- confirm
        # which version is pinned before modernizing.
        for page_number in range(read_pdf.getNumPages()):
            page = read_pdf.getPage(page_number)
            text_data.append(page.extractText())
# Run the extractor once per corpus file, passing the bare file name.
for pdf_name in files:
    extractmyfiles(pdf_name)
# Collect every alphabetic token (as ASCII bytes) from the extracted text.
# Compile the pattern once instead of re-scanning it per page; no capture
# group is needed -- findall on a group-less pattern returns the matches.
WORD_RE = re.compile(b"[a-zA-Z]+")
word = []
for page_text in text_data:
    # Drop non-ASCII characters that break downstream handling,
    # e.g. sequences like \asas\aeae\asas.
    ascii_bytes = page_text.encode('ascii', 'ignore')
    # extend() replaces the manual inner append loop; values stay as
    # bytes so the writer below can keep decoding them unchanged.
    word.extend(WORD_RE.findall(ascii_bytes))
print(len(word))
# Write all the words to test.csv, one word per line.
with open('test.csv', 'w', encoding='utf-8') as out_f:
    # join() avoids the accidental blank first line the old
    # '"\n" + word' pattern produced, and does a single write.
    out_f.write("\n".join(w.decode("utf-8") for w in word))
- # plot the frequency distribution
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement