Advertisement
Guest User

Untitled

a guest
Nov 20th, 2017
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.72 KB | None | 0 0
  1.  
  2. # Import NLTK and stopwords method.
  3. import nltk
  4. from nltk.corpus import stopwords
  5. nltk.download('stopwords')
  6.  
  7. def main():
  8. # Prime the loop.
  9. input_file_name = ''
  10.  
  11. # Prompt user to enter the name of the input file.
  12. while input_file_name == '':
  13. try:
  14. input_file_name = str(input("Please enter the name of the input data file: "))
  15. input_file = open(input_file_name, 'r')
  16. readFileTokenize(input_file)
  17. # Tell user that file has been successfully processed.
  18. print("The input file has been successfully processed.")
  19. # Check that the file exists; if not, prompt user to reenter file name.
  20. except FileNotFoundError:
  21. input_file_name = ''
  22.  
  23. term_freq_document = {}
  24. term_freq_dict = {}
  25. term_freq_document = calDocFreq(hobbies_tokens, student_id)
  26. term_freq_dict = calTermFreq(hobbies_tokens)
  27. term_freq_document[student_id] = calTermFreq(hobbies_tokens)
  28.  
  29. # Prime the loop.
  30. input_term_name = str(input("Please enter a word you wish to determine its term frequency "
  31. "and document frequency. Enter blank if done. "))
  32.  
  33. # Prompt user to enter the name of the term they wish to find TF and DF.
  34. while input_term_name != '':
  35. try:
  36. # Present the information to the user for each TF and DF.
  37. print("Document Frequency of", input_term_name, ":", term_freq_document.get(input_term_name))
  38. print("Term Frequency (TF) are as follows.")
  39. for input_term_name in term_freq_dict:
  40. print("TF of", input_term_name, "in", term_freq_dict.keys(), "are:",
  41. term_freq_dict.get(input_term_name))
  42. print("All of the information has been shown.")
  43.  
  44. # Check that the word exists; if not, prompt user to reenter term name.
  45. except ValueError:
  46. input_term_name = str(input("Please enter a word you wish to determine its term frequency "
  47. "and document frequency. Enter blank if done. "))
  48.  
  49. except KeyError:
  50. print("The word cannot be found in the collection.")
  51. input_term_name = str(input("Please enter a word you wish to determine its term frequency "
  52. "and document frequency. Enter blank if done. "))
  53.  
  54. # Close the file.
  55. input_file.close()
  56.  
  57. def readFileTokenize(input_file):
  58. # Read the text in the input file. Prime the loop.
  59. student_id = input_file.readline()
  60. while student_id != '':
  61. hobby_descrp = input_file.readline()
  62. hobbies_tokens = nltk.word_tokenize(hobby_descrp)
  63. hobbies_tokens = removePeriodsCommas(hobbies_tokens)
  64. hobbies_tokens = convertToLower(hobbies_tokens)
  65. hobbies_tokens = removeStopWords(hobbies_tokens)
  66. # Prime the loop.
  67. student_id = input_file.readline()
  68. return hobbies_tokens, student_id
  69.  
  70. # Remove periods and commas from the descriptions.
  71. def removePeriodsCommas(hobbies_tokens):
  72. # Loop to remove all periods and tokens.
  73. while "." in hobbies_tokens:
  74. hobbies_tokens.remove(".")
  75. while "'" in hobbies_tokens:
  76. hobbies_tokens.remove("'")
  77. return hobbies_tokens
  78.  
  79. # Convert all descriptions to lower case.
  80. def convertToLower(hobbies_tokens):
  81. # Create an empty list.
  82. lower_tokens = []
  83.  
  84. # Loop to make all word tokens lower case.
  85. for token in hobbies_tokens:
  86. token = token.lower()
  87. lower_tokens.append(token)
  88. return lower_tokens
  89.  
  90. # Remove all stopwords from descriptions (use NLTK method).
  91. def removeStopWords(hobbies_tokens):
  92. # Use the NLTK stopwords method.
  93. stop_words = stopwords.words('english')
  94.  
  95. # Loop to remove stopwords.
  96. for token in hobbies_tokens:
  97. if token in stop_words:
  98. hobbies_tokens.remove(token)
  99. return hobbies_tokens
  100.  
  101. # Calculate the term frequency in each document and save in nested dictionary.
  102. def calTermFreq(hobbies_tokens):
  103. # Create an empty dictionary.
  104. term_freq_dict = {}
  105.  
  106. # Loop over each word token to calculate term frequency.
  107. for token in hobbies_tokens:
  108. if token in term_freq_dict:
  109. term_freq_dict[token] += 1
  110. else:
  111. term_freq_dict[token] = 1
  112.  
  113. return term_freq_dict
  114.  
  115. # Calculate the document frequency and save in outer dictionary.
  116. def calDocFreq(student_id, hobbies_tokens):
  117. # Create an empty dictionary.
  118. term_freq_document = {}
  119.  
  120. # Loop over each word to calculate document frequency. But I only want to do this for each student id?
  121. count = 0
  122. for id in student_id:
  123. if id in term_freq_document[id]:
  124.  
  125.  
  126. # Run the program.
  127. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement