Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- from math import log
- from collections import defaultdict
class BigramManager:
    """Stores bigram counts, grouped by the first token of each bigram."""

    class BigramGroup:
        """Counts for all bigrams that share a common first token."""

        def __init__(self, first_token, second_token):
            # second_parts maps second token -> occurrence count;
            # total_count is the number of bigrams seen starting with
            # first_token (i.e. the unigram/context count of first_token).
            self.second_parts = defaultdict(lambda: 0)
            self.first_token = first_token
            self.second_parts[second_token] = 1
            self.total_count = 1

        def add_bigram(self, second_token):
            """Record one more occurrence of (first_token, second_token)."""
            self.second_parts[second_token] += 1
            self.total_count += 1

        def get_bigram_count(self, second_token):
            """Return the count of (first_token, second_token); 0 if unseen.

            Uses .get so that querying an unseen token does not insert a
            zero entry into the defaultdict (which would skew __len__ and
            get_freq_of_freq).
            """
            return self.second_parts.get(second_token, 0)

        def get_freq_of_freq(self):
            """Return a map: count -> number of bigram types with that count."""
            res = defaultdict(lambda: 0)
            for count in self.second_parts.values():
                res[count] += 1
            return res

        def __str__(self):
            result = 'First token: "{0}"\n'.format(self.first_token)
            result += 'Tokens: '
            for (token, count) in self.second_parts.items():
                result += '"{0}"={1} '.format(token, count)
            return result

        def __len__(self):
            # Number of distinct second tokens seen after first_token.
            return len(self.second_parts)

    ##-----------------------------------------------------------------------------

    def __init__(self):
        # first_token -> BigramGroup.  Plain dict on purpose: the original
        # defaultdict(lambda: 0) meant that merely *querying* an unseen
        # token inserted a 0 entry, inflating get_total_unigrams_count().
        self.bigrams = {}

    def add(self, first_token, second_token):
        """Record one occurrence of the bigram (first_token, second_token)."""
        bgr = self.bigrams.get(first_token)
        if bgr:
            bgr.add_bigram(second_token)
        else:
            self.bigrams[first_token] = self.BigramGroup(first_token, second_token)

    def get_unigram_freq(self, first_token):
        """Return how many bigrams start with first_token (context count).

        BUG FIX: previously returned len(self.bigrams) — the number of
        distinct first tokens — regardless of the argument, which broke
        the Laplace denominator count(w1) used by callers.
        """
        bgr = self.bigrams.get(first_token)
        return bgr.total_count if bgr else 0

    def get_bigram_freq(self, first_token, second_token):
        """Return the count of the bigram (first_token, second_token); 0 if unseen."""
        bgr = self.bigrams.get(first_token)
        if bgr:
            return bgr.get_bigram_count(second_token)
        return 0

    def get_total_unigrams_count(self):
        """Return the number of distinct first tokens seen in training."""
        return len(self.bigrams)

    def get_freq_of_freq(self):
        """Return a map: count -> number of bigram types with that count, over all groups."""
        res = defaultdict(lambda: 0)
        for bgr in self.bigrams.values():
            for (c, N) in bgr.get_freq_of_freq().items():
                res[c] += N
        return res

    def __str__(self):
        # BUG FIX: iterate values — iterating the dict itself yields the
        # first-token keys, so str(bgr) printed the token string, not the
        # BigramGroup's report.
        result = ''
        for bgr in self.bigrams.values():
            result += str(bgr) + '\n\n'
        return result
class LaplaceBigramLanguageModel:
    """Bigram language model with add-one (Laplace) smoothing.

    P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V)

    where V is the vocabulary-size estimate computed during training.
    """

    def __init__(self, corpus):
        """Build the bigram tables and train on the given corpus."""
        self.bigrams = BigramManager()
        self.train(corpus)

    def train(self, corpus):
        """Takes a corpus and trains your language model.

        Counts every adjacent token pair in every sentence; assumes each
        sentence exposes a .data list of objects with a .word attribute.
        """
        for sentence in corpus.corpus:
            # Pair each token with its successor instead of the original
            # manual index loop.
            for first, second in zip(sentence.data, sentence.data[1:]):
                self.bigrams.add(first.word, second.word)
        # Vocabulary-size estimate: number of distinct context tokens seen.
        self.V = self.bigrams.get_total_unigrams_count()

    def score(self, sentence):
        """Takes a list of strings and returns the log-probability of the
        sentence under the trained Laplace-smoothed bigram model.
        """
        score = 0.0
        for context_token, token in zip(sentence, sentence[1:]):
            bigram_count = self.bigrams.get_bigram_freq(context_token, token)
            unigram_count = self.bigrams.get_unigram_freq(context_token)
            # log((c(w1,w2) + 1) / (c(w1) + V)), accumulated in log space
            # to avoid underflow.
            score += log(bigram_count + 1)
            score -= log(unigram_count + self.V)
        return score
Add Comment
Please sign in to add a comment.