Guest User

Untitled

a guest
Jul 15th, 2018
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.37 KB | None | 0 0
  1. from math import log
  2. from collections import defaultdict
  3.  
  4. class BigramManager:
  5.  
  6.   class BigramGroup:
  7.  
  8.     def __init__(self, first_token, second_token):
  9.       self.second_parts = defaultdict(lambda: 0)
  10.       self.first_token = first_token
  11.       self.second_parts[second_token] = 1
  12.       self.total_count = 1
  13.  
  14.     def add_bigram(self, second_token):
  15.       self.second_parts[second_token] += 1
  16.       self.total_count += 1
  17.       pass
  18.  
  19.     def get_bigram_count(self, second_token):
  20.       return self.second_parts[second_token]
  21.       pass
  22.  
  23.     def get_freq_of_freq(self):
  24.       res = defaultdict(lambda: 0)
  25.       for (_, count) in self.second_parts.items():
  26.         res[count] = res[count] + 1
  27.       return res
  28.  
  29.     def __str__(self):
  30.       result = 'First token: "{0}"\n'.format(self.first_token)
  31.       result += 'Tokens: '
  32.       for (token, count) in self.second_parts.items():
  33.         result += '"{0}"={1} '.format(token, count)
  34.       return result
  35.  
  36.     def __len__(self):
  37.       return len(self.second_parts)
  38. ##-----------------------------------------------------------------------------
  39.  
  40.   def __init__(self):
  41.     self.bigrams = defaultdict(lambda: 0)
  42.     pass
  43.  
  44.   def add(self, first_token, second_token):
  45.     bgr = self.bigrams[first_token]
  46.     if bgr:
  47.       bgr.add_bigram(second_token)
  48.     else:
  49.       self.bigrams[first_token] = self.BigramGroup(first_token, second_token)
  50.     pass
  51.  
  52.   def get_unigram_freq(self, first_token):
  53.     return len(self.bigrams)
  54.  
  55.   def get_bigram_freq(self, first_token, second_token):
  56.     bgr = self.bigrams[first_token]
  57.     if bgr:
  58.       return bgr.get_bigram_count(second_token)
  59.     return 0
  60.  
  61.   def get_total_unigrams_count(self):
  62.     return len(self.bigrams)
  63.  
  64.   def get_freq_of_freq(self):
  65.     res = defaultdict(lambda: 0)
  66.     for (_, bgr) in self.bigrams.items():
  67.       r1 = bgr.get_freq_of_freq()
  68.       for (c, N) in r1.items():
  69.         res[c] = res[c] + N
  70.     return res
  71.  
  72.   def __str__(self):
  73.     result = ''
  74.     for bgr in self.bigrams:
  75.       result += str(bgr) + '\n\n'
  76.     return result
  77.  
  78.    
  79.    
  80.  
  81.  
  82. class LaplaceBigramLanguageModel:
  83.  
  84.   def __init__(self, corpus):
  85.     """Initialize your data structures in the constructor."""
  86.     # TODO your code here
  87.     self.bigrams = BigramManager()
  88.     self.train(corpus)
  89.     pass
  90.  
  91.   def train(self, corpus):
  92.     """ Takes a corpus and trains your language model.
  93.        Compute any counts or other corpus statistics in this function.
  94.    """  
  95.     # TODO your code here
  96.     for sentence in corpus.corpus:
  97.       i = 0
  98.       while i < len(sentence.data)-1:
  99.         token1 = sentence.data[i].word
  100.         token2 = sentence.data[i+1].word
  101.         self.bigrams.add(token1, token2)
  102.         i += 1
  103.     self.V = self.bigrams.get_total_unigrams_count()
  104.     pass
  105.  
  106.   def score(self, sentence):
  107.     """ Takes a list of strings as argument and returns the log-probability of the
  108.        sentence using your language model. Use whatever data you computed in train() here.
  109.    """
  110.     # TODO your code here
  111.     score = 0.0
  112.     i = 1
  113.     while i < len(sentence):
  114.       context_token = sentence[i-1]
  115.       token = sentence[i]
  116.       bigram_count = self.bigrams.get_bigram_freq(context_token, token)
  117.       unigram_count = self.bigrams.get_unigram_freq(context_token)
  118.       score += log(bigram_count + 1)
  119.       score -= log(unigram_count + self.V)
  120.       i += 1
  121.     return score
Add Comment
Please, Sign In to add comment