Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Import NLTK and stopwords method.
- import nltk
- from nltk.corpus import stopwords
- nltk.download('stopwords')
def main():
    """Load an input data file and run the interactive TF/DF lookup.

    Repeatedly prompts for a file name until one can be opened, builds the
    term-frequency and document-frequency dictionaries from it with
    readFileTokenize, and then passes both dictionaries to search.
    """
    term_frequency_dict = None
    doc_frequency_dict = None
    # Keep asking until a file has been opened and processed.
    while term_frequency_dict is None:
        input_file_name = input("Please enter the name of the input data file: ")
        try:
            # The with-block closes the file as soon as it has been read,
            # even if processing raises, instead of holding it open for the
            # whole interactive search session.
            with open(input_file_name, 'r') as input_file:
                term_frequency_dict, doc_frequency_dict = readFileTokenize(input_file)
        except FileNotFoundError:
            # The file does not exist; prompt for the name again.
            continue
        # Tell user that file has been successfully processed.
        print("The input file has been successfully processed.")
    # Call the function to search for TF and DF.
    search(term_frequency_dict, doc_frequency_dict)
# Run the program only when executed as a script, not when imported.
# NOTE(review): this call sits above the definitions of search() and
# readFileTokenize(); it has to execute after they are defined (e.g. from
# the end of the module), otherwise main() raises NameError on those names.
if __name__ == "__main__":
    main()
# Search for the term that the user wants to find TF and DF for.
def search(term_frequency_dict, doc_frequency_dict):
    """Interactively report document and term frequencies for entered words.

    Prompts for a word until a blank entry is given. For each word found in
    the collection, prints its document frequency and its term frequency in
    every document that contains it; otherwise prints a not-found message.

    Args:
        term_frequency_dict: mapping of document id -> {term: count}.
        doc_frequency_dict: mapping of term -> number of documents that
            contain it.
    """
    prompt = ("Please enter a word you wish to determine its term frequency "
              "and document frequency. Enter blank if done. ")
    # Indexed terms are lower-cased, so normalise the query the same way.
    input_term_name = input(prompt).strip().lower()
    while input_term_name != '':
        if input_term_name in doc_frequency_dict:
            # Present the information to the user for each TF and DF.
            print("Document Frequency of", input_term_name, ":",
                  doc_frequency_dict[input_term_name])
            print("Term Frequency (TF) are as follows.")
            # Use separate loop names so the queried term is not overwritten.
            for document_id, term_counts in term_frequency_dict.items():
                if input_term_name in term_counts:
                    print("TF of", input_term_name, "in", document_id, "are:",
                          term_counts[input_term_name])
            print("All of the information has been shown.")
        else:
            print("The word cannot be found in the collection.")
        # Always ask again; without this a successful lookup never ends.
        input_term_name = input(prompt).strip().lower()
def readFileTokenize(input_file):
    """Build the term-frequency and document-frequency dictionaries.

    The file is read as alternating lines: a student id line followed by a
    hobby description line. Each description is tokenized, stripped of
    period/comma tokens, lower-cased and filtered for stop words before
    counting. Reading stops at end of file or at a blank id line.

    Args:
        input_file: an open text file object positioned at the first id line.

    Returns:
        A tuple (term_frequency_dict, doc_frequency_dict) where the first
        maps student id -> {term: count} and the second maps term -> number
        of descriptions containing it.
    """
    term_frequency_dict = {}
    doc_frequency_dict = {}
    # Strip the trailing newline ('\n', not the characters '/' and 'n') so
    # ids are stored clean and end of file yields the empty string.
    student_id = input_file.readline().rstrip('\n')
    # Read each line to process the hobby text.
    while student_id != '':
        hobby_descrp = input_file.readline()
        hobbies_tokens = nltk.word_tokenize(hobby_descrp)
        hobbies_tokens = removePeriodsCommas(hobbies_tokens)
        hobbies_tokens = convertToLower(hobbies_tokens)
        hobbies_tokens = removeStopWords(hobbies_tokens)
        term_inner_dict = calTermFreq(hobbies_tokens)
        term_frequency_dict[student_id] = term_inner_dict
        doc_frequency_dict = calDocFreq(doc_frequency_dict, term_inner_dict)
        # Advance to the next record's id line.
        student_id = input_file.readline().rstrip('\n')
    return term_frequency_dict, doc_frequency_dict
# Remove periods and commas from the descriptions.
def removePeriodsCommas(hobbies_tokens):
    """Delete every "." and "," token from the list in place and return it."""
    for punctuation in (".", ","):
        # One remove() per occurrence drops all copies of this token.
        for _ in range(hobbies_tokens.count(punctuation)):
            hobbies_tokens.remove(punctuation)
    return hobbies_tokens
# Convert all descriptions to lower case.
def convertToLower(hobbies_tokens):
    """Return a new list holding the lower-cased form of each token."""
    return [word.lower() for word in hobbies_tokens]
# Remove all stopwords from descriptions (use NLTK method).
def removeStopWords(hobbies_tokens):
    """Remove English stop words from the token list and return it.

    The list is filtered in one pass and updated in place, so callers that
    keep a reference to the argument see the same result as the return value.

    Args:
        hobbies_tokens: list of lower-cased word tokens.

    Returns:
        The same list object with every NLTK English stop word removed.
    """
    # A set gives constant-time membership tests for each token.
    stop_words = set(stopwords.words('english'))
    # Build the filtered list first; removing items while iterating the same
    # list skips the token that follows each removed one.
    hobbies_tokens[:] = [token for token in hobbies_tokens
                         if token not in stop_words]
    return hobbies_tokens
# Calculate the term frequency in each document and save in nested dictionary.
def calTermFreq(hobbies_tokens):
    """Return a dictionary mapping each token to its number of occurrences."""
    counts = {}
    for word in hobbies_tokens:
        # Start from zero the first time a word is seen.
        counts[word] = counts.get(word, 0) + 1
    return counts
# Calculate the document frequency and save in outer dictionary.
def calDocFreq(doc_frequency_dict, term_inner_dict):
    """Add one document's terms to the running document-frequency counts.

    Each key of term_inner_dict bumps its entry in doc_frequency_dict by one
    (creating it at 1 if absent). The dictionary is updated in place and
    also returned.
    """
    for term in term_inner_dict:
        doc_frequency_dict[term] = doc_frequency_dict.get(term, 0) + 1
    return doc_frequency_dict
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement