Guest User

Untitled

a guest
Jan 23rd, 2018
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.80 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. from __future__ import division
  4. from collections import defaultdict
  5.  
  6. def aufgabe3():
  7.    
  8.     ref_dict = defaultdict(int)
  9.     test_dict = defaultdict(int)
  10.  
  11.     test_corpus = open('dracula.txt','r')
  12.     ref_corpus = open('grimm.txt','r')
  13.    
  14.     output_file = open('terminology_output.txt','w')
  15.     output_file2 = open('unique_words.txt','w')
  16.    
  17.  
  18.     ref_len = 0
  19.     test_len = 0
  20.  
  21.     for line in ref_corpus:
  22.         for w in line.rstrip().split():
  23.             ref_dict[w.lower()] += 1
  24.             ref_len += 1
  25.        
  26.     for line in test_corpus:
  27.         for w in line.rstrip().split():
  28.             test_dict[w.lower()] += 1
  29.             test_len += 1
  30.  
  31.     for w in sorted(test_dict,key=lambda x: chi_square(x,ref_dict,ref_len,test_dict,test_len),reverse=True):
  32.         if w in ref_dict:
  33.             output_file.write(w + ':\t' + str(chi_square(w,ref_dict,ref_len,test_dict,test_len)) + '\n')
  34.    
  35.     for w in sorted(test_dict,key=test_dict.get,reverse=True):
  36.         if w not in ref_dict:
  37.             output_file2.write(w + ':\t' + str(test_dict[w]) + '\n')
  38.    
  39.     print('output written to terminology_output.txt and unique_words.txt')
  40.  
  41.     test_corpus.close()
  42.     ref_corpus.close()
  43.     output_file.close()
  44.     output_file2.close()
  45.  
  46. def chi_square(word,ref_dict,reflen,test_dict,testlen):
  47.    
  48.     # let's ignore words that do not occur in reference corpus for the moment
  49.     # alternatives: smoothing (giving those words an expected probability > 0); generating a separate list with 'unique' words
  50.     if not word in ref_dict:
  51.         return 0
  52.        
  53.     #calculate observed/expected probability through relative frequency
  54.     expected = ref_dict[word]/reflen
  55.     observed = test_dict[word]/testlen
  56.    
  57.     return (observed-expected)**2/expected
Add Comment
Please, Sign In to add comment