Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class KneserNeyLanguageModel(LaplaceLanguageModel):
    """N-gram language model with interpolated Kneser-Ney smoothing.

    For orders above one, the probability of ``word`` after ``prefix`` is::

        (max(count(prefix, word) - delta, 0)
         + delta * distinct_followers(prefix) * P_lower(word | prefix[1:]))
        / count(prefix)

    where ``P_lower`` is the same model of order ``n - 1``.  The recursion
    bottoms out at unigram continuation probabilities: the share of distinct
    left contexts a word appears after.
    """

    def __init__(self, lines, n, delta=1.0):
        """Count n-grams in ``lines`` and build the smoothed probability table.

        :param lines: tokenized training lines (space-separated strings).
            The collection is iterated several times, so it must be
            re-iterable (a list, not a one-shot generator).
        :param n: order of the model; must be at least 1.
        :param delta: absolute discount subtracted from every observed count.
        """
        # The parent initializer is not invoked (unchanged from the original);
        # every attribute this class sets is assigned directly below.
        self.n = n
        self.lines = lines
        self.delta = delta
        counts = count_ngrams(lines, self.n)
        self.vocab = {
            token for token_counts in counts.values() for token in token_counts
        }
        self.probs = self.evaluate_probs(n)

    def evaluate_probs(self, n):
        """Return Kneser-Ney probabilities of order ``n``.

        :param n: n-gram order to evaluate (>= 1).
        :returns: ``defaultdict(Counter)`` mapping a prefix tuple to a mapping
            of next token -> probability.  Unigrams live under the empty tuple.
        :raises ValueError: if ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n-gram order must be at least 1, got %r" % (n,))
        if n == 1:
            return self._continuation_unigram_probs()

        counts = count_ngrams(self.lines, n)
        # Build the lower-order table once per order instead of once per
        # (prefix, word) pair.
        lower_order = self.evaluate_probs(n - 1)

        probs = defaultdict(Counter)
        for prefix, token_counts in counts.items():
            prefix_total = sum(token_counts.values())
            # Probability mass freed by discounting every distinct follower.
            backoff_weight = self.delta * len(token_counts)
            # Assumes count_ngrams keys are tuples of n - 1 tokens, consistent
            # with the empty-tuple key used for unigrams; dropping the oldest
            # token yields the matching lower-order prefix.
            backoff = lower_order.get(prefix[1:], {})
            for word, count in token_counts.items():
                discounted = max(count - self.delta, 0)
                probs[prefix][word] = (
                    discounted + backoff_weight * backoff.get(word, 0)
                ) / prefix_total
        return probs

    def _continuation_unigram_probs(self):
        """Return unigram probabilities proportional to distinct left contexts."""
        left_contexts = defaultdict(set)
        for line in self.lines:
            # Pad with UNK on the left and EOS on the right so the first real
            # token and the end-of-sentence marker both get a left context.
            words = (UNK + ' ' + line + ' ' + EOS).split()
            for previous, current in zip(words, words[1:]):
                left_contexts[current].add(previous)

        total_contexts = sum(len(contexts) for contexts in left_contexts.values())
        probs = defaultdict(Counter)
        for word, contexts in left_contexts.items():
            probs[tuple()][word] = len(contexts) / total_contexts
        return probs
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement