Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- @author: Abdulla Muaz
- @desc : Retrieving corpus, extracting words
- """
- import re # for regular expression
- import PyPDF2 # to read and extract pdf files
- import os # to fetch all files in the directory
- import nltk
- from nltk.tokenize import TreebankWordTokenizer
# Tokenizer instance for NLTK word tokenization.
# NOTE(review): unused in the visible extraction code -- presumably used
# further down the file (e.g. for the frequency plot); confirm before removing.
tokenizer = TreebankWordTokenizer()

# Accumulates the raw text of every page of every PDF in the corpus.
text_data = []

# Make sure the punkt tokenizer models are available for nltk.
nltk.download('punkt')

# List every file in the 'corpus' subdirectory of the working directory.
# os.path.join is portable; the old hard-coded '\\' only worked on Windows.
# (The stray bare `files` expression that followed was REPL residue and
# had no effect in a script, so it has been dropped.)
files = os.listdir(os.path.join(os.getcwd(), 'corpus'))
# Read one PDF at a time and append the text of each of its pages to the
# module-level text_data list.
def extractmyfiles(pdf):
    """Extract every page of the corpus PDF named *pdf* into ``text_data``.

    Parameters
    ----------
    pdf : str
        Bare file name of a PDF located in the 'corpus' subdirectory of
        the current working directory.

    Side effects
    ------------
    Appends one string per page to the global ``text_data`` list.
    """
    # Build the full path portably; the old '+\\' concatenation was
    # Windows-only.
    path = os.path.join(os.getcwd(), 'corpus', pdf)
    with open(path, 'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        # NOTE(review): PdfFileReader / getNumPages / getPage / extractText
        # are the legacy PyPDF2 1.x API; PyPDF2 3.x renamed them to
        # PdfReader, len(reader.pages), and page.extract_text() -- confirm
        # which version is pinned before modernizing.
        for page_number in range(read_pdf.getNumPages()):
            page = read_pdf.getPage(page_number)
            text_data.append(page.extractText())
# Run the extractor once per corpus file, passing the bare file name.
for pdf_name in files:
    extractmyfiles(pdf_name)
# Collect every alphabetic token (as ASCII bytes) from the extracted text.
# Compile the pattern once instead of re-scanning it per page; no capture
# group is needed -- findall on a group-less pattern returns the matches.
WORD_RE = re.compile(b"[a-zA-Z]+")
word = []
for page_text in text_data:
    # Drop non-ASCII characters that break downstream handling,
    # e.g. sequences like \asas\aeae\asas.
    ascii_bytes = page_text.encode('ascii', 'ignore')
    # extend() replaces the manual inner append loop; values stay as
    # bytes so the writer below can keep decoding them unchanged.
    word.extend(WORD_RE.findall(ascii_bytes))
print(len(word))
# Write all the words to test.csv, one word per line.
with open('test.csv', 'w', encoding='utf-8') as out_f:
    # join() avoids the accidental blank first line the old
    # '"\n" + word' pattern produced, and does a single write.
    out_f.write("\n".join(w.decode("utf-8") for w in word))
- # plot the frequency distribution
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement