Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Submitter: alyssat3(Tan, Alyssa)
- # Partner : chopkin(Hopkins, Courvoisier)
- # We certify that we worked cooperatively on this programming
- # assignment, according to the rules for pair programming
- import re # used in my sentence_at_a_time generator function
import math # used in cosine_metric
- import prompt # for use in script
- import goody # for use in script
from collections import defaultdict # dicts and defaultdicts are == when they have the same keys/associations
- # For use in build_semantic_dictionary: see problem specifications
def sentence_at_a_time(open_file : open, ignore_words : {str}) -> [str]:
    """Generate one sentence at a time from open_file, as a list of
    lower-cased words with punctuation removed and every word in
    ignore_words dropped.

    A sentence ends at a word containing . ? ! ; or :  The final
    sentence is yielded even if the file ends without punctuation.

    Fix: removed a dead debug branch (after the stripping loop below no
    end punctuation can remain in the word, so the old
    `if end_punct.search(w): print(w)` could never fire).
    """
    # Matches sentence-ending punctuation anywhere in a word.
    end_punct = re.compile(r'[.?!;:]')
    # Punctuation to erase (replaced by a space): commas, quotes, *,
    # parentheses, --, and the curly double quotes (U+201C / U+201D).
    remove_punct = re.compile(r'(,|\'|"|\*|\(|\)|--|' + chr(8220) + '|' + chr(8221) + ')')
    pending = []        # words read from the file but not yet consumed
    sentence = []       # words of the sentence currently being built
    for line in open_file:
        pending = pending + remove_punct.sub(' ', line.lower()).split()
        while pending:
            word = pending.pop(0)
            if end_punct.search(word):
                # Strip trailing characters until no end punctuation
                # remains anywhere in the word, then finish the sentence.
                while end_punct.search(word):
                    word = word[:-1]
                if word != '' and word not in ignore_words:
                    sentence.append(word)
                yield sentence
                sentence = []
            else:
                if word != '' and word not in ignore_words:
                    sentence.append(word)
    # Special case: last sentence missing its final punctuation.
    if sentence:
        yield sentence
def build_semantic_dictionary(training_files : [open], ignore_file : open) -> {str:{str:int}}:
    """Build and return a semantic dictionary: each word maps to its
    context, a dict counting every other word that co-occurs with it in
    some sentence of the training files.

    Words listed in ignore_file (one per line) are excluded.  Every
    training file and the ignore file are closed before returning.
    """
    ignored = {line.rstrip() for line in ignore_file}
    contexts = defaultdict(lambda: defaultdict(int))
    for training in training_files:
        for sentence in sentence_at_a_time(training, ignored):
            # Count every ordered pair of distinct words in the sentence.
            for a in sentence:
                for b in sentence:
                    if a != b:
                        contexts[a][b] += 1
        training.close()
    ignore_file.close()
    return contexts
def dict_as_str(semantic : {str:{str:int}}) -> str:
    """Return a printable representation of a semantic dictionary:
    one sorted "context for" line per word, then a line reporting the
    minimum/maximum context sizes.

    Fixes: the old code unconditionally chopped the last character to
    drop a trailing comma, which removed the '=' for a word with an
    empty context (defaultdict lookups elsewhere can create those);
    it also crashed with ValueError on an empty dictionary and built
    the result with quadratic string concatenation.
    """
    lines = []
    for word, context in sorted(semantic.items()):
        # Join " key@count" entries with commas; empty context -> "".
        entries = ",".join(" {}@{}".format(w, c) for w, c in sorted(context.items()))
        lines.append(" context for " + word + " =" + entries + "\n")
    sizes = [len(context) for context in semantic.values()]
    minimum = min(sizes) if sizes else 0
    maximum = max(sizes) if sizes else 0
    lines.append(" min/max context lengths = " + str(minimum) + "/" + str(maximum) + "\n")
    return "".join(lines)
def cosine_metric(context1 : {str:int}, context2 : {str:int}) -> float:
    """Return the cosine similarity of two word-count contexts: their
    dot product divided by the product of their magnitudes.

    Raises ZeroDivisionError if either context is empty.
    """
    if context1 == {} or context2 == {}:
        raise ZeroDivisionError("Not divisible by 0")
    # Dot product over context1's keys; words missing from context2
    # contribute 0.
    dot = sum(count * context2.get(word, 0) for word, count in context1.items())
    magnitude1 = math.sqrt(sum(count ** 2 for count in context1.values()))
    magnitude2 = math.sqrt(sum(count ** 2 for count in context2.values()))
    return dot / (magnitude1 * magnitude2)
def most_similar(word : str, choices : [str], semantic : {str:{str:int}}, metric : callable) -> str:
    """Return the choice whose context is most similar to word's
    context, as measured by metric; ties go to the earliest choice."""
    return max(choices, key=lambda choice: metric(semantic[word], semantic[choice]))
def similarity_test(test_file : open, semantic : {str:{str:int}}, metric : callable) -> str:
    """Run every synonym problem in test_file and return a report
    string ending with the percentage answered correctly.

    Each line of test_file holds: the problem word, one or more choice
    words, and finally the correct answer.  test_file is closed before
    returning.

    Fixes: the two byte-identical exception handlers are merged into
    one, and the report is assembled with str.join instead of
    quadratic string concatenation.
    """
    report = []
    correct = 0
    incorrect = 0
    for line in test_file:
        fields = line.rstrip("\n").split()
        word = fields[0]
        answer = fields[-1]
        choices = fields[1:-1]
        try:
            choice = most_similar(word, choices, semantic, metric)
            if choice == answer:
                report.append(" Correct: '" + word + "' is most like '" + choice + "' from " + str(choices) + "\n")
                correct += 1
            else:
                report.append(" Incorrect: '" + word + "' is most like '" + answer + "', not '" + choice + "' from " + str(choices) + "\n")
                incorrect += 1
        except (ZeroDivisionError, KeyError):
            # The metric could not rank the choices: an empty context
            # or a word never seen in training.
            report.append(" Metric failure: could not choose synonym for '" + word + "' from " + str(choices) + "\n")
            incorrect += 1
    test_file.close()
    report.append(str(round((correct / (incorrect + correct)) * 100, 1)) + "% correct\n")
    return "".join(report)
# Script
if __name__ == '__main__':
    # Collect training files until the user enters nothing; unopenable
    # names are rejected with a message and the prompt repeats.
    files = []
    while True:
        fileName = input("Enter name of text file for training (no-more to start processing)[no-more]: ")
        if fileName == "":
            break
        try:
            files.append(open(fileName))
        except IOError:
            print(" file named", fileName, "rejected: cannot be opened")

    wordDict = build_semantic_dictionary(files, open("ignore_words.txt"))
    if input("Print Semantic dictionary?[False]:") == "True":
        print(dict_as_str(wordDict))

    # Bug fix: similarity_test returns its report as a string, but the
    # old script discarded it, so the results were never shown.
    print(similarity_test(goody.safe_open("Enter name of problem file[synonym-problems.txt]: ", "r", "cannot be opened"), wordDict, cosine_metric))

    # For running batch self-tests
    print()
    import driver
    driver.default_file_name = "bsc5.txt"
    # driver.default_show_traceback = True
    # driver.default_show_exception = True
    # driver.default_show_exception_message = True
    driver.driver()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement