Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from __future__ import division
- from collections import defaultdict
def aufgabe3():
    """Extract terminology candidates from a test corpus against a reference corpus.

    Reads 'dracula.txt' (test) and 'grimm.txt' (reference), counts lowercase
    token frequencies, then writes:
      - terminology_output.txt: words occurring in both corpora, sorted by
        descending chi-square score ("word:\tscore" per line)
      - unique_words.txt: words absent from the reference corpus, sorted by
        descending test-corpus frequency ("word:\tcount" per line)
    """
    ref_dict = defaultdict(int)
    test_dict = defaultdict(int)
    ref_len = 0
    test_len = 0

    # 'with' guarantees the handles are closed even if reading fails.
    with open('grimm.txt', 'r') as ref_corpus:
        for line in ref_corpus:
            for w in line.rstrip().split():
                ref_dict[w.lower()] += 1
                ref_len += 1

    with open('dracula.txt', 'r') as test_corpus:
        for line in test_corpus:
            for w in line.rstrip().split():
                test_dict[w.lower()] += 1
                test_len += 1

    # Compute each word's score exactly once; the original recomputed
    # chi_square inside the sort key AND again in the write loop.
    scores = {w: chi_square(w, ref_dict, ref_len, test_dict, test_len)
              for w in test_dict}

    with open('terminology_output.txt', 'w') as output_file:
        for w in sorted(test_dict, key=scores.get, reverse=True):
            if w in ref_dict:
                output_file.write(w + ':\t' + str(scores[w]) + '\n')

    with open('unique_words.txt', 'w') as output_file2:
        for w in sorted(test_dict, key=test_dict.get, reverse=True):
            if w not in ref_dict:
                output_file2.write(w + ':\t' + str(test_dict[w]) + '\n')

    print('output written to terminology_output.txt and unique_words.txt')
def chi_square(word, ref_dict, reflen, test_dict, testlen):
    """Return a chi-square-style score for *word*: (observed - expected)^2 / expected.

    Probabilities are estimated by relative frequency: expected comes from the
    reference corpus (ref_dict counts over reflen tokens), observed from the
    test corpus (test_dict counts over testlen tokens).

    Words that do not occur in the reference corpus score 0 for the moment.
    Alternatives: smoothing (giving those words an expected probability > 0);
    generating a separate list with 'unique' words.
    """
    if word not in ref_dict:
        return 0
    # Observed/expected probability via relative frequency.
    expected = ref_dict[word] / reflen
    observed = test_dict[word] / testlen
    return (observed - expected) ** 2 / expected
Add Comment
Please, Sign In to add comment