Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Submitter: alyssat3(Tan, Alyssa)
- # Partner : chopkin(Hopkins, Courvoisier)
- # We certify that we worked cooperatively on this programming
- # assignment, according to the rules for pair programming
- import re # used in my sentence_at_a_time generator function
import math # used in cosine_metric
- import prompt # for use in script
- import goody # for use in script
from collections import defaultdict # dicts and defaultdicts are == when they have the same keys/associations
- # For use in build_semantic_dictionary: see problem specifications
def sentence_at_a_time(open_file : open, ignore_words : {str}) -> [str]:
    """Generate one sentence at a time from open_file, as a list of
    lower-cased words with punctuation removed and every word in
    ignore_words dropped.

    A sentence ends at a word containing . ? ! ; or :  The final
    sentence is yielded even if the file ends without punctuation.

    Fix: removed a dead debug branch (after the stripping loop below no
    end punctuation can remain in the word, so the old
    `if end_punct.search(w): print(w)` could never fire).
    """
    # Matches sentence-ending punctuation anywhere in a word.
    end_punct = re.compile(r'[.?!;:]')
    # Punctuation to erase (replaced by a space): commas, quotes, *,
    # parentheses, --, and the curly double quotes (U+201C / U+201D).
    remove_punct = re.compile(r'(,|\'|"|\*|\(|\)|--|' + chr(8220) + '|' + chr(8221) + ')')
    pending = []        # words read from the file but not yet consumed
    sentence = []       # words of the sentence currently being built
    for line in open_file:
        pending = pending + remove_punct.sub(' ', line.lower()).split()
        while pending:
            word = pending.pop(0)
            if end_punct.search(word):
                # Strip trailing characters until no end punctuation
                # remains anywhere in the word, then finish the sentence.
                while end_punct.search(word):
                    word = word[:-1]
                if word != '' and word not in ignore_words:
                    sentence.append(word)
                yield sentence
                sentence = []
            else:
                if word != '' and word not in ignore_words:
                    sentence.append(word)
    # Special case: last sentence missing its final punctuation.
    if sentence:
        yield sentence
def build_semantic_dictionary(training_files : [open], ignore_file : open) -> {str:{str:int}}:
    """Build and return a semantic dictionary: each word maps to its
    context, a dict counting every other word that co-occurs with it in
    some sentence of the training files.

    Words listed in ignore_file (one per line) are excluded.  Every
    training file and the ignore file are closed before returning.
    """
    ignored = {line.rstrip() for line in ignore_file}
    contexts = defaultdict(lambda: defaultdict(int))
    for training in training_files:
        for sentence in sentence_at_a_time(training, ignored):
            # Count every ordered pair of distinct words in the sentence.
            for a in sentence:
                for b in sentence:
                    if a != b:
                        contexts[a][b] += 1
        training.close()
    ignore_file.close()
    return contexts
def dict_as_str(semantic : {str:{str:int}}) -> str:
    """Return a printable representation of a semantic dictionary:
    one sorted "context for" line per word, then a line reporting the
    minimum/maximum context sizes.

    Fixes: the old code unconditionally chopped the last character to
    drop a trailing comma, which removed the '=' for a word with an
    empty context (defaultdict lookups elsewhere can create those);
    it also crashed with ValueError on an empty dictionary and built
    the result with quadratic string concatenation.
    """
    lines = []
    for word, context in sorted(semantic.items()):
        # Join " key@count" entries with commas; empty context -> "".
        entries = ",".join(" {}@{}".format(w, c) for w, c in sorted(context.items()))
        lines.append(" context for " + word + " =" + entries + "\n")
    sizes = [len(context) for context in semantic.values()]
    minimum = min(sizes) if sizes else 0
    maximum = max(sizes) if sizes else 0
    lines.append(" min/max context lengths = " + str(minimum) + "/" + str(maximum) + "\n")
    return "".join(lines)
def cosine_metric(context1 : {str:int}, context2 : {str:int}) -> float:
    """Return the cosine similarity of two word-count contexts: their
    dot product divided by the product of their magnitudes.

    Raises ZeroDivisionError if either context is empty.
    """
    if context1 == {} or context2 == {}:
        raise ZeroDivisionError("Not divisible by 0")
    # Dot product over context1's keys; words missing from context2
    # contribute 0.
    dot = sum(count * context2.get(word, 0) for word, count in context1.items())
    magnitude1 = math.sqrt(sum(count ** 2 for count in context1.values()))
    magnitude2 = math.sqrt(sum(count ** 2 for count in context2.values()))
    return dot / (magnitude1 * magnitude2)
def most_similar(word : str, choices : [str], semantic : {str:{str:int}}, metric : callable) -> str:
    """Return the choice whose context is most similar to word's
    context, as measured by metric; ties go to the earliest choice."""
    return max(choices, key=lambda choice: metric(semantic[word], semantic[choice]))
def similarity_test(test_file : open, semantic : {str:{str:int}}, metric : callable) -> str:
    """Run every synonym problem in test_file and return a report
    string ending with the percentage answered correctly.

    Each line of test_file holds: the problem word, one or more choice
    words, and finally the correct answer.  test_file is closed before
    returning.

    Fixes: the two byte-identical exception handlers are merged into
    one, and the report is assembled with str.join instead of
    quadratic string concatenation.
    """
    report = []
    correct = 0
    incorrect = 0
    for line in test_file:
        fields = line.rstrip("\n").split()
        word = fields[0]
        answer = fields[-1]
        choices = fields[1:-1]
        try:
            choice = most_similar(word, choices, semantic, metric)
            if choice == answer:
                report.append(" Correct: '" + word + "' is most like '" + choice + "' from " + str(choices) + "\n")
                correct += 1
            else:
                report.append(" Incorrect: '" + word + "' is most like '" + answer + "', not '" + choice + "' from " + str(choices) + "\n")
                incorrect += 1
        except (ZeroDivisionError, KeyError):
            # The metric could not rank the choices: an empty context
            # or a word never seen in training.
            report.append(" Metric failure: could not choose synonym for '" + word + "' from " + str(choices) + "\n")
            incorrect += 1
    test_file.close()
    report.append(str(round((correct / (incorrect + correct)) * 100, 1)) + "% correct\n")
    return "".join(report)
# Script
if __name__ == '__main__':
    # Collect training files until the user enters nothing; unopenable
    # names are rejected with a message and the prompt repeats.
    files = []
    while True:
        fileName = input("Enter name of text file for training (no-more to start processing)[no-more]: ")
        if fileName == "":
            break
        try:
            files.append(open(fileName))
        except IOError:
            print(" file named", fileName, "rejected: cannot be opened")

    wordDict = build_semantic_dictionary(files, open("ignore_words.txt"))
    if input("Print Semantic dictionary?[False]:") == "True":
        print(dict_as_str(wordDict))

    # Bug fix: similarity_test returns its report as a string, but the
    # old script discarded it, so the results were never shown.
    print(similarity_test(goody.safe_open("Enter name of problem file[synonym-problems.txt]: ", "r", "cannot be opened"), wordDict, cosine_metric))

    # For running batch self-tests
    print()
    import driver
    driver.default_file_name = "bsc5.txt"
    # driver.default_show_traceback = True
    # driver.default_show_exception = True
    # driver.default_show_exception_message = True
    driver.driver()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement