Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import bisect
- import random
- import itertools
- import re
class Markov:
    """Markov-chain token generator that doubles as an endless iterator.

    The model is stored in ``self._tree``, a dict mapping each token to a
    pair ``(counts, followers)``:

    * ``followers`` maps every token observed directly after this one to
      the number of times that transition was seen.
    * ``counts`` is a list where ``counts[0]`` is always 1, ``counts[1]``
      is the total number of outgoing transitions, and ``counts[i]`` for
      ``i >= 2`` is the transition-weighted number of ``i``-step chains
      that start at this token.

    Iterating over an instance yields tokens forever: whenever a chain
    runs into a dead end, a fresh chain is started from a random token.
    """

    # Separator pattern used when ``tokenize=True``: punctuation and/or
    # whitespace runs are collapsed to a single space before splitting.
    # Raw string so that ``\s`` is not an invalid string escape.
    # NOTE(review): the "0" in the first character class also treats the
    # digit zero as a separator; kept unchanged -- confirm it is intended.
    _TOKEN_SEPARATORS = re.compile(r"([.,?0:;()!-]+\s*)|([.,?!]*\s+)")

    def __init__(self, n, data, tokenize=False):
        """Build the transition table from *data*.

        Args:
            n: Order parameter. ``counts`` is extended up to index ``n``
                and generation weights successors by ``counts[n - 2]``.
                NOTE(review): for ``n < 2`` that index is negative and
                wraps around via Python indexing -- confirm intended use.
            data: Sequence of tokens (a plain string is consumed
                character by character), or raw text if *tokenize* is set.
            tokenize: When true, split *data* into words on punctuation
                and whitespace first.

        Empty tokens are ignored everywhere, including at the start of
        the input. Empty input produces an empty model whose iteration
        ends immediately.
        """
        self._tree = {}
        self._n = n
        self.__it__ = None  # active generator, created lazily by __next__

        if tokenize:
            data = self._TOKEN_SEPARATORS.sub(" ", data).split(" ")
        # Drop empty tokens up front so a leading separator cannot register
        # '' as a state.
        tokens = [tok for tok in data if tok != ""]
        if not tokens:
            return

        # First pass: count direct transitions between consecutive tokens.
        counts, followers = self._tree.setdefault(tokens[0], ([1, 0], {}))
        for token in tokens[1:]:
            followers[token] = followers.get(token, 0) + 1
            counts[1] += 1
            counts, followers = self._tree.setdefault(token, ([1, 0], {}))

        # Second pass: extend every counts list one level at a time.
        # Level depth + 1 only reads level depth, which every entry already
        # has, so appending while looping over the values is safe.
        for depth in range(1, n):
            for counts, followers in self._tree.values():
                counts.append(
                    sum(
                        seen * self._tree[nxt][0][depth]
                        for nxt, seen in followers.items()
                    )
                )

    def gen(self):
        """Yield one chain of tokens, ending when no successor is viable.

        The chain starts at a uniformly random known token. Each following
        token is drawn from the current token's followers, weighted by the
        transition count times the follower's ``counts[n - 2]`` entry, so
        followers with a zero entry there are never chosen.

        Trace lines are printed to stdout for the start of the chain and
        for every yielded token.
        """
        print("GEN")
        if not self._tree:
            return  # nothing was learned, so there is nothing to emit
        next_key = random.choice(list(self._tree.keys()))
        lookahead = self._n - 2
        while True:
            print(" yield: {0}".format(next_key))
            yield next_key
            _count, succs = self._tree[next_key]
            keys = list(succs)
            weights = [succs[key] * self._tree[key][0][lookahead] for key in keys]
            cumul_dist = list(itertools.accumulate(weights))
            try:
                # IndexError covers both dead ends: no followers at all
                # (empty cumul_dist) and all-zero weights (bisect lands
                # past the last key).
                pick = bisect.bisect(cumul_dist, random.random() * cumul_dist[-1])
                next_key = keys[pick]
            except IndexError:
                break

    def __iter__(self):
        """Start a fresh chain and return ``self`` as the iterator."""
        self.__it__ = self.gen()
        return self

    def __next__(self):
        """Return the next token, restarting the chain at dead ends.

        Works even if ``iter()`` was never called on the instance.

        Raises:
            StopIteration: Only when the model is empty.
        """
        if not self._tree:
            # An empty model can never yield; without this guard the
            # restart loop below would spin forever.
            raise StopIteration
        if self.__it__ is None:
            self.__it__ = self.gen()
        while True:
            try:
                return next(self.__it__)
            except StopIteration:
                self.__it__ = self.gen()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement