Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from __future__ import division
- from collections import defaultdict
def aufgabe3():
    """Extract terminology candidates from a test corpus against a reference corpus.

    Reads 'dracula.txt' (test) and 'grimm.txt' (reference), counts lowercase
    token frequencies, then writes:
      - terminology_output.txt: words occurring in both corpora, sorted by
        descending chi-square score ("word:\tscore" per line)
      - unique_words.txt: words absent from the reference corpus, sorted by
        descending test-corpus frequency ("word:\tcount" per line)
    """
    ref_dict = defaultdict(int)
    test_dict = defaultdict(int)
    ref_len = 0
    test_len = 0

    # 'with' guarantees the handles are closed even if reading fails.
    with open('grimm.txt', 'r') as ref_corpus:
        for line in ref_corpus:
            for w in line.rstrip().split():
                ref_dict[w.lower()] += 1
                ref_len += 1

    with open('dracula.txt', 'r') as test_corpus:
        for line in test_corpus:
            for w in line.rstrip().split():
                test_dict[w.lower()] += 1
                test_len += 1

    # Compute each word's score exactly once; the original recomputed
    # chi_square inside the sort key AND again in the write loop.
    scores = {w: chi_square(w, ref_dict, ref_len, test_dict, test_len)
              for w in test_dict}

    with open('terminology_output.txt', 'w') as output_file:
        for w in sorted(test_dict, key=scores.get, reverse=True):
            if w in ref_dict:
                output_file.write(w + ':\t' + str(scores[w]) + '\n')

    with open('unique_words.txt', 'w') as output_file2:
        for w in sorted(test_dict, key=test_dict.get, reverse=True):
            if w not in ref_dict:
                output_file2.write(w + ':\t' + str(test_dict[w]) + '\n')

    print('output written to terminology_output.txt and unique_words.txt')
def chi_square(word, ref_dict, reflen, test_dict, testlen):
    """Return a chi-square-style score for *word*: (observed - expected)^2 / expected.

    Probabilities are estimated by relative frequency: expected comes from the
    reference corpus (ref_dict counts over reflen tokens), observed from the
    test corpus (test_dict counts over testlen tokens).

    Words that do not occur in the reference corpus score 0 for the moment.
    Alternatives: smoothing (giving those words an expected probability > 0);
    generating a separate list with 'unique' words.
    """
    if word not in ref_dict:
        return 0
    # Observed/expected probability via relative frequency.
    expected = ref_dict[word] / reflen
    observed = test_dict[word] / testlen
    return (observed - expected) ** 2 / expected
Add Comment
Please, Sign In to add comment