# Untitled


# Import NLTK and its stopwords corpus.
import nltk
from nltk.corpus import stopwords
# Fetch the stop-word corpus that removeStopWords reads. This is a
# module-level statement, so it runs every time this file is executed or
# imported.
# NOTE(review): readFileTokenize calls nltk.word_tokenize, which presumably
# needs NLTK's Punkt tokenizer data as well; that is not downloaded here --
# confirm it is installed.
nltk.download('stopwords')

def main():
    """Run the interactive term-frequency / document-frequency lookup.

    Prompts for an input file name until a file can be opened, reads every
    student record from it with readFileTokenize, builds the per-student
    term frequencies (calTermFreq) and the collection-wide document
    frequencies (calDocFreq), and then answers look-ups for words typed by
    the user until a blank entry is given.
    """
    # Keep asking until a file can be opened; tokens_by_student stays None
    # until a file has actually been read.
    tokens_by_student = None
    while tokens_by_student is None:
        input_file_name = input("Please enter the name of the input data file: ")
        try:
            input_file = open(input_file_name, 'r')
        except FileNotFoundError:
            # Say why the prompt is repeated instead of looping silently.
            print("The file cannot be found. Please try again.")
            continue

        # The with-block closes the file even if processing raises.
        with input_file:
            tokens_by_student = {}
            # readFileTokenize returns None once the file is exhausted; that
            # value is the sentinel that ends the iteration.
            records = iter(lambda: readFileTokenize(input_file), None)
            for hobbies_tokens, student_id in records:
                # Drop any line ending from the id and merge records that
                # share the same id into one document.
                tokens_by_student.setdefault(student_id.strip(), []).extend(hobbies_tokens)

        # Tell user that file has been successfully processed.
        print("The input file has been successfully processed.")

    # Nested dictionary: student id -> {term: count in that description}.
    term_freq_document = {}
    for student_id, hobbies_tokens in tokens_by_student.items():
        term_freq_document[student_id] = calTermFreq(hobbies_tokens)

    # Outer dictionary: term -> number of student descriptions containing it.
    # Arguments follow calDocFreq's declared order (ids first, tokens second).
    doc_freq = calDocFreq(list(tokens_by_student), tokens_by_student)

    prompt = ("Please enter a word you wish to determine its term frequency "
              "and document frequency. Enter blank if done. ")

    # Prime the loop.
    input_term_name = input(prompt)

    # Answer look-ups until the user enters a blank line.
    while input_term_name != '':
        # Tokens were lower-cased when the file was read, so normalise the
        # query the same way before looking it up.
        term = input_term_name.strip().lower()

        if term not in doc_freq:
            print("The word cannot be found in the collection.")
        else:
            print("Document Frequency of", term, ":", doc_freq[term])
            print("Term Frequency (TF) are as follows.")
            # A separate loop variable is used so the queried word is not
            # overwritten while the per-student counts are printed.
            for student_id, term_freq_dict in term_freq_document.items():
                print("TF of", term, "in", student_id, "are:",
                      term_freq_dict.get(term, 0))
            print("All of the information has been shown.")

        # Always ask again; otherwise the loop could never terminate.
        input_term_name = input(prompt)

def readFileTokenize(input_file):
    """Read one student record from input_file and tokenize its description.

    A record is two consecutive lines: a student id line followed by a
    hobby description line. Each call consumes exactly one record, so the
    function can be called repeatedly on the same open file to walk through
    every record.

    Parameters:
        input_file: an open text file positioned at the start of a record.

    Returns:
        A (hobbies_tokens, student_id) tuple for the record that was read,
        where hobbies_tokens is the cleaned token list and student_id is
        the id line with surrounding whitespace removed; or None when the
        end of the file has been reached.
    """
    # readline() returns '' only at end of file; a line holding nothing but
    # whitespace is treated as a separator and skipped.
    student_id = input_file.readline()
    while student_id != '' and student_id.strip() == '':
        student_id = input_file.readline()
    if student_id == '':
        return None

    # The description is the line that follows the id.
    hobby_descrp = input_file.readline()
    hobbies_tokens = nltk.word_tokenize(hobby_descrp)
    hobbies_tokens = removePeriodsCommas(hobbies_tokens)
    hobbies_tokens = convertToLower(hobbies_tokens)
    hobbies_tokens = removeStopWords(hobbies_tokens)

    # Return the id that belongs to these tokens, without reading ahead into
    # the next record.
    return hobbies_tokens, student_id.strip()

# Remove periods and commas from the descriptions.
def removePeriodsCommas(hobbies_tokens):
    """Remove stand-alone period and comma tokens from a token list.

    Only tokens that are exactly "." or "," are dropped; punctuation that is
    part of a longer token is left alone. Bare apostrophe tokens ("'") are
    dropped as well, because the earlier version of this function removed
    them.

    Parameters:
        hobbies_tokens: list of string tokens; it is modified in place.

    Returns:
        The same list object, with the punctuation tokens removed.
    """
    unwanted = (".", ",", "'")
    # Slice assignment filters in a single pass while keeping the caller's
    # list object (the list has always been edited in place).
    hobbies_tokens[:] = [token for token in hobbies_tokens if token not in unwanted]
    return hobbies_tokens

# Convert all descriptions to lower case.
def convertToLower(hobbies_tokens):
    """Return a new list holding the lower-case form of every token.

    The list passed in is left untouched; the order of the tokens is kept.
    """
    # Build the lower-cased copy in one expression.
    return [word.lower() for word in hobbies_tokens]

# Remove all stopwords from descriptions (use NLTK method).
def removeStopWords(hobbies_tokens):
    """Remove every English stop word from a token list.

    Parameters:
        hobbies_tokens: list of string tokens; it is modified in place.

    Returns:
        The same list object, without any token found in NLTK's English
        stop-word list.
    """
    # A set gives constant-time membership tests for each token.
    stop_words = set(stopwords.words('english'))

    # Build the filtered list first and then write it back. Removing items
    # from a list while iterating over it skips the element that follows
    # each removal, which left consecutive stop words in place.
    hobbies_tokens[:] = [token for token in hobbies_tokens if token not in stop_words]
    return hobbies_tokens

# Calculate the term frequency in each document and save in nested dictionary.
def calTermFreq(hobbies_tokens):
    """Count how many times each token occurs in hobbies_tokens.

    Returns a dictionary mapping each distinct token to its number of
    occurrences, with keys in order of first appearance.
    """
    counts = {}
    for word in hobbies_tokens:
        # A token seen for the first time starts from zero.
        counts[word] = counts.get(word, 0) + 1
    return counts

# Calculate the document frequency and save in outer dictionary.
def calDocFreq(student_id, hobbies_tokens):
    """Count, for every term, how many student documents contain it.

    Parameters:
        student_id: an iterable of student ids, or a single id given as a
            string. Repeated ids are counted once.
        hobbies_tokens: a dictionary mapping each student id to that
            student's tokens (a token list, or a term-frequency dictionary
            whose keys are the terms). When student_id is a single string,
            this may instead be that one student's token list.

    Returns:
        A dictionary mapping each term to the number of listed students
        whose tokens include it. A term counts at most once per student, no
        matter how often it occurs in that student's description. Ids with
        no entry in hobbies_tokens contribute nothing.
    """
    if isinstance(student_id, str):
        # Single-document form: wrap the arguments so the loop below can
        # treat both call shapes the same way.
        student_ids = [student_id]
        if not isinstance(hobbies_tokens, dict):
            hobbies_tokens = {student_id: hobbies_tokens}
    else:
        student_ids = student_id

    doc_freq = {}
    # dict.fromkeys removes duplicates while keeping first-seen order, so a
    # repeated id or a repeated token is only counted once.
    for sid in dict.fromkeys(student_ids):
        for token in dict.fromkeys(hobbies_tokens.get(sid, ())):
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq


# Run the program.
# Start the interactive program only when this file is executed as a script,
# so that importing it does not prompt for input.
if __name__ == "__main__":
    main()