Advertisement
mfgnik

Untitled

Oct 4th, 2020
1,328
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.52 KB | None | 0 0
  1. class KneserNeyLanguageModel(LaplaceLanguageModel):
  2.     """ this code is an example, no need to change anything """
  3.     def __init__(self, lines, n, delta=1.0):
  4.         self.n = n
  5.         self.lines = lines
  6.         self.delta = delta
  7.         counts = count_ngrams(lines, self.n)
  8.         self.vocab = set(token for token_counts in counts.values() for token in token_counts)
  9.         self.probs = self.evaluate_probs(n)
  10.  
  11.     def evaluate_probs(self, n):
  12.         counts = count_ngrams(self.lines, n)
  13.         probs = defaultdict(Counter)
  14.         if n == 1:
  15.             unique_prev_words = {}
  16.             for line in self.lines:
  17.                 line = UNK + ' ' + line + ' ' + EOS
  18.                 words = line.split()
  19.                 for index in range(1, len(words)):
  20.                     if words[index] not in unique_prev_words):
  21.                         unique_prev_words[words[index]] = set()
  22.                     unique_prev_words[words[index]].add(words[index-1])
  23.             sum_counts_unique = np.sum([len(word) for word in unique_prev_words.values()])
  24.             for word in unique_prev_words:
  25.                 probs[tuple()][word] = len(unique_prev_words[word]) / sum_counts_unique
  26.             return probs
  27.         for prefix in counts:
  28.             count_prefix = np.sum(list(counts[prefix].values()))
  29.             for word in counts[prefix]:
  30.                 probs[prefix][word] = (max(counts[prefix][word] - self.delta, 0)  +  self.delta * len(counts[prefix]) * self.evaluate_probs(n - 1)) / count_prefix
  31.         return probs
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement