Guest User

Untitled

a guest
Jun 29th, 2017
325
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.35 KB | None | 0 0
  1. from collections import defaultdict, namedtuple
  2. import cPickle
  3. from random import choice, randint, shuffle
  4.  
  5. class Markov(object):
  6.     """A class that analyzes the word occurance patterns of a given
  7.    text file (the corpus) and can generate random text in the style
  8.    of that corpus."""
  9.  
  10.     def __init__(self, corpus_file):
  11.         self.corpus = open(corpus_file)
  12.         self.text = self.corpus.read()
  13.         self.words = self.text.split()
  14.  
  15.     def make_ngrams(self, n, inputlist):
  16.         """Make ngrams of every n consecutive
  17.        words to feed the dictionary function, AS LIST."""
  18.         for x in range(0, len(inputlist)-n):
  19.             yield [inputlist[x+i] for i in range(n)]
  20.  
  21.     def make_dictionary(self, n, inputlist=None):
  22.         """For every ngram, takes first n-1 words as key, and last as value."""
  23.  
  24.         # TODO: make case/punct-insensitive?
  25.         if inputlist is None:
  26.             inputlist = self.words
  27.  
  28.         temp_dict = defaultdict(list)
  29.         for wordlist in self.make_ngrams(n, inputlist):
  30.             final_word = wordlist.pop()
  31.             temp_dict[tuple(wordlist)].append(final_word)
  32.  
  33.         return temp_dict
  34.  
  35.     def generate(self, length=100, n=3):
  36.         """Make random text of given length (using ngrams of the given n)."""
  37.         word_dict = self.make_dictionary(n)
  38.         seed_no = randint(0,len(self.words)-n) # choose random seed
  39.         output = [self.words[seed_no+x] for x in range(n-1)]
  40.         for x in range(n-1, length):
  41.             next_key = tuple(output[-(n-1):])
  42.             output.append(choice(word_dict[next_key]))
  43.  
  44.         return " ".join(output)
  45.  
  46. class POS_Markov(Markov):
  47.     """A class that analyzes both word occurance patterns and
  48.    part-of-speech patterns of a given text file (the corpus) and can
  49.    generate random text in the style of that corpus.
  50.  
  51.    This class expects a text file of a corpus POS-tagged by nltk--a list
  52.    of ("word", "POS") tuples--serialized by cPickle."""
  53.  
  54.     TaggedWord = namedtuple("TaggedWord", ["word", "pos"])
  55.  
  56.     def __init__(self, corpus_file, word_n=3, pos_n=3):
  57.         self.corpus_file = corpus_file
  58.         self.tagged_words = [self.TaggedWord(t[0],t[1]) for t in self.open_serialized(corpus_file)]
  59.         self.words = [tw.word for tw in self.tagged_words]
  60.         self.pos = [tw.pos for tw in self.tagged_words]
  61.         self.word_dictionary = self.make_dictionary(word_n, self.tagged_words)
  62.         self.pos_dictionary = self.make_dictionary(pos_n, self.pos)
  63.         self.pos_n = pos_n
  64.         self.word_n = word_n
  65.  
  66.     def open_serialized(self, filename):
  67.         """Unpickle a given file. Returns contents.
  68.        Expected file contents: a list of ("word", "POS") tuples."""
  69.  
  70.         with open(filename, 'r') as infile:
  71.             data = cPickle.load(infile)
  72.         return data
  73.  
  74.     def get_word_by_pos(self, wordlist, given_pos):
  75.         """Returns a list of items in a given wordlist that are of the
  76.        given part of speech."""
  77.  
  78.         return [item for item in wordlist if item.pos == given_pos]
  79.  
  80.     def generate(self, length=100):
  81.         """Generates random text of given length. First selects next POS;
  82.        if there exists a matching next word of that POS, selects it; and if not,
  83.        selects a different next POS.
  84.  
  85.        Unsure whether this works any better than non-POS generation. At a guess,
  86.        will work best where pos_n > word_n."""
  87.  
  88.         seed_no = randint(0,len(self.tagged_words)-self.pos_n) # choose random seed
  89.         output = [self.tagged_words[seed_no+x] for x in range(self.pos_n-1)]
  90.         for x in range(self.pos_n-1, length):
  91.             next_pos_key = tuple([tw.pos for tw in output[-(self.pos_n-1):]])
  92.             next_pos_choices = self.pos_dictionary[next_pos_key]
  93.             next_word_key = tuple(output[-(self.word_n-1):])
  94.             next_picked = False
  95.             while not next_picked:
  96.                 shuffle(next_pos_choices)
  97.                 next_pos = next_pos_choices.pop()
  98.                 choices = self.get_word_by_pos(self.word_dictionary[next_word_key], next_pos)
  99.                 if choices:
  100.                     output.append(choice(choices))
  101.                     next_picked = True
  102.                 else:
  103.                     pass
  104.  
  105.         return " ".join([tw.word for tw in output])
  106.  
  107. mymarkov = Markov("corpus.txt")
  108.  
  109. print(mymarkov.generate(500, n=7))
Add Comment
Please, Sign In to add comment