Advertisement
Guest User

Untitled

a guest
Oct 13th, 2019
128
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.79 KB | None | 0 0
  1. # Submitter: alyssat3(Tan, Alyssa)
  2. # Partner : chopkin(Hopkins, Courvoisier)
  3. # We certify that we worked cooperatively on this programming
  4. # assignment, according to the rules for pair programming
  5. import re # used in my sentence_at_a_time generator function
  6. import math # use in cosine_meteric
  7. import prompt # for use in script
  8. import goody # for use in script
  9. from collections import defaultdict # dicts and defaultdictsare == when they have the same keys/associations
  10.  
  11.  
  12. # For use in build_semantic_dictionary: see problem specifications
  13. def sentence_at_a_time(open_file : open, ignore_words : {str}) -> [str]:
  14. end_punct = re.compile('[.?\!;:]')
  15. remove_punct = re.compile(r'(,|\'|"|\*|\(|\)|--|'+chr(8220)+'|'+chr(8221)+')')
  16. prev = []
  17. answer = []
  18. for l in open_file:
  19. l = remove_punct.sub(' ',l.lower())
  20. prev = prev + l.split()
  21. while prev:
  22. w = prev.pop(0)
  23. if end_punct.search(w):
  24. while end_punct.search(w):
  25. w = w[0:-1]
  26. if w != '' and w not in ignore_words:
  27. if end_punct.search(w):
  28. print(w)
  29. answer.append(w)
  30. yield answer
  31. answer = []
  32. else:
  33. if w != '' and w not in ignore_words:
  34. answer.append(w)
  35.  
  36. # handle special case of last sentence missing final punctuation
  37. if answer:
  38. yield answer
  39.  
  40.  
  41. def build_semantic_dictionary(training_files : [open], ignore_file : open) -> {str:{str:int}}:
  42. ignored = set()
  43. returnDict = defaultdict(lambda :defaultdict(int))
  44. for i in ignore_file:
  45. i = i.rstrip()
  46. ignored.add(i)
  47. for line in training_files:
  48. for s in sentence_at_a_time(line, ignored):
  49. for word in s:
  50. for other in s:
  51. if other != word:
  52. returnDict[word][other] += 1
  53. line.close()
  54. ignore_file.close()
  55. return returnDict
  56.  
  57. def dict_as_str(semantic : {str:{str:int}}) -> str:
  58. returnStr = ""
  59. for k, v in sorted(semantic.items()):
  60. returnStr += " context for " + k + " ="
  61. for key, value in sorted(v.items()):
  62. returnStr += " " + key + "@" + str(value) + ","
  63. returnStr = returnStr[:-1]
  64. returnStr += "\n"
  65. minimum = min([len(i) for i in semantic.values()])
  66. maximum = max([len(i) for i in semantic.values()])
  67. returnStr += " min/max context lengths = " + str(minimum) + "/" + str(maximum) + "\n"
  68. return returnStr
  69.  
  70.  
  71. def cosine_metric(context1 : {str:int}, context2 : {str:int}) -> float:
  72. if context1 == {} or context2 == {}:
  73. raise ZeroDivisionError("Not divisible by 0")
  74. top = 0
  75. num = 0
  76. for i in context1.keys():
  77. top += context1[i] * context2.get(i,0)
  78. num += context1[i] ** 2
  79. bottom = math.sqrt(num)
  80. num = 0
  81. for i in context2.values():
  82. num += i ** 2
  83. bottom *= math.sqrt(num)
  84. return top/bottom
  85.  
  86. def most_similar(word : str, choices : [str], semantic : {str:{str:int}}, metric : callable) -> str:
  87. numberList = []
  88. for i in choices:
  89. numberList.append(metric(semantic[word],semantic[i]))
  90. return choices[numberList.index(max(numberList))]
  91.  
  92.  
  93. def similarity_test(test_file : open, semantic : {str:{str:int}}, metric : callable) -> str:
  94. returnStr= ""
  95. correct = 0
  96. incorrect = 0
  97. for line in test_file:
  98. line = line.rstrip("\n")
  99. wordList = line.split()
  100. word = wordList[0]
  101. answer = wordList[-1]
  102. checkList = wordList[1:len(wordList) - 1]
  103. try:
  104. choice = most_similar(word, checkList, semantic, metric)
  105. if choice == answer:
  106. returnStr += " Correct: '" + word + "' is most like '" + choice + "' from " + str(checkList) + "\n"
  107. correct += 1
  108. else:
  109. returnStr += " Incorrect: '" + word + "' is most like '" + answer + "', not '" + choice + "' from " + str(checkList) + "\n"
  110. incorrect += 1
  111. except ZeroDivisionError:
  112. returnStr += " Metric failure: could not choose synonym for '" + word + "' from " + str(checkList) + "\n"
  113. incorrect += 1
  114. except KeyError:
  115. returnStr += " Metric failure: could not choose synonym for '" + word + "' from " + str(checkList) + "\n"
  116. incorrect += 1
  117. test_file.close()
  118. returnStr += str(round((correct / (incorrect + correct)) * 100, 1)) + "% correct\n"
  119. return returnStr
  120.  
  121. # Script
  122.  
  123. if __name__ == '__main__':
  124. # Write script here
  125. fileName = "file"
  126. files = []
  127. while fileName != "":
  128. fileName = input("Enter name of text file for training (no-more to start processing)[no-more]: ")
  129. if(fileName == ""):
  130. break
  131. try:
  132. files.append(open(fileName))
  133. except IOError:
  134. print(" file named",fileName, "rejected: cannot be opened")
  135. wordDict = build_semantic_dictionary(files, open("ignore_words.txt"))
  136. printOrNot = input("Print Semantic dictionary?[False]:")
  137. if(printOrNot == "True"):
  138. print(dict_as_str(wordDict))
  139. similarity_test(goody.safe_open("Enter name of problem file[synonym-problems.txt]: ", "r", "cannot be opened"),wordDict,cosine_metric)
  140. # For running batch self-tests
  141. print()
  142. import driver
  143. driver.default_file_name = "bsc5.txt"
  144. # driver.default_show_traceback = True
  145. # driver.default_show_exception = True
  146. # driver.default_show_exception_message = True
  147. driver.driver()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement