Advertisement
Guest User

Untitled

a guest
Nov 21st, 2017
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.21 KB | None | 0 0
  1. # Import NLTK and stopwords method.
  2. import nltk
  3. from nltk.corpus import stopwords
  4. nltk.download('stopwords')
  5.  
  6. def main():
  7. # Prime the loop.
  8. input_file_name = ''
  9.  
  10. # Prompt user to enter the name of the input file.
  11. while input_file_name == '':
  12. try:
  13. input_file_name = str(input("Please enter the name of the input data file: "))
  14. input_file = open(input_file_name, 'r')
  15. term_frequency_dict, doc_frequency_dict = readFileTokenize(input_file)
  16. # Tell user that file has been successfully processed.
  17. print("The input file has been successfully processed.")
  18. # Check that the file exists; if not, prompt user to reenter file name.
  19. except FileNotFoundError:
  20. input_file_name = ''
  21.  
  22. # Call the function to search for TF and DF.
  23. search(term_frequency_dict, doc_frequency_dict)
  24.  
  25. # Close the file.
  26. input_file.close()
  27.  
  28. # Run the program.
  29. main()
  30.  
  31. # Search for the term that the user wants to find TF and DF for.
  32. def search(term_frequency_dict, doc_frequency_dict):
  33. # Prime the loop.
  34. input_term_name = str(input("Please enter a word you wish to determine its term frequency "
  35. "and document frequency. Enter blank if done. "))
  36.  
  37. # Prompt user to enter the name of the term they wish to find TF and DF.
  38. while input_term_name != '':
  39. try:
  40. # Present the information to the user for each TF and DF.
  41. print("Document Frequency of", input_term_name, ":", doc_frequency_dict.get())
  42. print("Term Frequency (TF) are as follows.")
  43. for input_term_name in term_frequency_dict:
  44. if input_term_name in term_frequency_dict[input_term_name]:
  45. print("TF of", input_term_name, "in", term_frequency_dict.keys(), "are:",
  46. term_frequency_dict.get(input_term_name))
  47. print("All of the information has been shown.")
  48.  
  49. # Check that the word exists; if not, prompt user to reenter term name.
  50. except ValueError:
  51. input_term_name = str(input("Please enter a word you wish to determine its term frequency "
  52. "and document frequency. Enter blank if done. "))
  53.  
  54. except KeyError:
  55. print("The word cannot be found in the collection.")
  56. input_term_name = str(input("Please enter a word you wish to determine its term frequency "
  57. "and document frequency. Enter blank if done. "))
  58.  
  59. def readFileTokenize(input_file):
  60. # Read the text in the input file. Prime the loop and create empty dictionaries.
  61. student_id = input_file.readline().rstrip('/n')
  62. term_frequency_dict = {}
  63. doc_frequency_dict = {}
  64.  
  65. # Read each line to process the hobby text.
  66. while student_id != '':
  67. hobby_descrp = input_file.readline()
  68. hobbies_tokens = nltk.word_tokenize(hobby_descrp)
  69. hobbies_tokens = removePeriodsCommas(hobbies_tokens)
  70. hobbies_tokens = convertToLower(hobbies_tokens)
  71. hobbies_tokens = removeStopWords(hobbies_tokens)
  72. term_inner_dict = calTermFreq(hobbies_tokens)
  73. term_frequency_dict[student_id] = term_inner_dict
  74. doc_frequency_dict = calDocFreq(doc_frequency_dict, term_inner_dict)
  75.  
  76. # Prime the loop.
  77. student_id = input_file.readline().rstrip('/n')
  78. return term_frequency_dict, doc_frequency_dict
  79.  
  80. # Remove periods and commas from the descriptions.
  81. def removePeriodsCommas(hobbies_tokens):
  82. # Loop to remove all periods and tokens.
  83. while "." in hobbies_tokens:
  84. hobbies_tokens.remove(".")
  85. while "," in hobbies_tokens:
  86. hobbies_tokens.remove(",")
  87. return hobbies_tokens
  88.  
  89. # Convert all descriptions to lower case.
  90. def convertToLower(hobbies_tokens):
  91. # Create an empty list.
  92. lower_tokens = []
  93.  
  94. # Loop to make all word tokens lower case.
  95. for token in hobbies_tokens:
  96. token = token.lower()
  97. lower_tokens.append(token)
  98. return lower_tokens
  99.  
  100. # Remove all stopwords from descriptions (use NLTK method).
  101. def removeStopWords(hobbies_tokens):
  102. # Use the NLTK stopwords method.
  103. stop_words = stopwords.words('english')
  104.  
  105. # Loop to remove stopwords.
  106. for token in hobbies_tokens:
  107. if token in stop_words:
  108. hobbies_tokens.remove(token)
  109. return hobbies_tokens
  110.  
  111. # Calculate the term frequency in each document and save in nested dictionary.
  112. def calTermFreq(hobbies_tokens):
  113. # Create an empty dictionary.
  114. term_freq_dict = {}
  115.  
  116. # Loop over each word token to calculate term frequency.
  117. for token in hobbies_tokens:
  118. if token in term_freq_dict:
  119. term_freq_dict[token] += 1
  120. else:
  121. term_freq_dict[token] = 1
  122. return term_freq_dict
  123.  
  124. # Calculate the document frequency and save in outer dictionary.
  125. def calDocFreq(doc_frequency_dict, term_inner_dict):
  126.  
  127. # Loop over each word to calculate document frequency.
  128. for token in term_inner_dict:
  129. if token in doc_frequency_dict:
  130. doc_frequency_dict[token] += 1
  131. else:
  132. doc_frequency_dict[token] = 1
  133. return doc_frequency_dict
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement