Guest User

Untitled

a guest
Jun 29th, 2017
179
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from collections import defaultdict, namedtuple
  2. import cPickle
  3. from random import choice, randint, shuffle
  4.  
  5. class Markov(object):
  6.     """A class that analyzes the word occurance patterns of a given
  7.    text file (the corpus) and can generate random text in the style
  8.    of that corpus."""
  9.  
  10.     def __init__(self, corpus_file):
  11.         self.corpus = open(corpus_file)
  12.         self.text = self.corpus.read()
  13.         self.words = self.text.split()
  14.  
  15.     def make_ngrams(self, n, inputlist):
  16.         """Make ngrams of every n consecutive
  17.        words to feed the dictionary function, AS LIST."""
  18.         for x in range(0, len(inputlist)-n):
  19.             yield [inputlist[x+i] for i in range(n)]
  20.  
  21.     def make_dictionary(self, n, inputlist=None):
  22.         """For every ngram, takes first n-1 words as key, and last as value."""
  23.  
  24.         # TODO: make case/punct-insensitive?
  25.         if inputlist is None:
  26.             inputlist = self.words
  27.  
  28.         temp_dict = defaultdict(list)
  29.         for wordlist in self.make_ngrams(n, inputlist):
  30.             final_word = wordlist.pop()
  31.             temp_dict[tuple(wordlist)].append(final_word)
  32.  
  33.         return temp_dict
  34.  
  35.     def generate(self, length=100, n=3):
  36.         """Make random text of given length (using ngrams of the given n)."""
  37.         word_dict = self.make_dictionary(n)
  38.         seed_no = randint(0,len(self.words)-n) # choose random seed
  39.         output = [self.words[seed_no+x] for x in range(n-1)]
  40.         for x in range(n-1, length):
  41.             next_key = tuple(output[-(n-1):])
  42.             output.append(choice(word_dict[next_key]))
  43.  
  44.         return " ".join(output)
  45.  
  46. class POS_Markov(Markov):
  47.     """A class that analyzes both word occurance patterns and
  48.    part-of-speech patterns of a given text file (the corpus) and can
  49.    generate random text in the style of that corpus.
  50.  
  51.    This class expects a text file of a corpus POS-tagged by nltk--a list
  52.    of ("word", "POS") tuples--serialized by cPickle."""
  53.  
  54.     TaggedWord = namedtuple("TaggedWord", ["word", "pos"])
  55.  
  56.     def __init__(self, corpus_file, word_n=3, pos_n=3):
  57.         self.corpus_file = corpus_file
  58.         self.tagged_words = [self.TaggedWord(t[0],t[1]) for t in self.open_serialized(corpus_file)]
  59.         self.words = [tw.word for tw in self.tagged_words]
  60.         self.pos = [tw.pos for tw in self.tagged_words]
  61.         self.word_dictionary = self.make_dictionary(word_n, self.tagged_words)
  62.         self.pos_dictionary = self.make_dictionary(pos_n, self.pos)
  63.         self.pos_n = pos_n
  64.         self.word_n = word_n
  65.  
  66.     def open_serialized(self, filename):
  67.         """Unpickle a given file. Returns contents.
  68.        Expected file contents: a list of ("word", "POS") tuples."""
  69.  
  70.         with open(filename, 'r') as infile:
  71.             data = cPickle.load(infile)
  72.         return data
  73.  
  74.     def get_word_by_pos(self, wordlist, given_pos):
  75.         """Returns a list of items in a given wordlist that are of the
  76.        given part of speech."""
  77.  
  78.         return [item for item in wordlist if item.pos == given_pos]
  79.  
  80.     def generate(self, length=100):
  81.         """Generates random text of given length. First selects next POS;
  82.        if there exists a matching next word of that POS, selects it; and if not,
  83.        selects a different next POS.
  84.  
  85.        Unsure whether this works any better than non-POS generation. At a guess,
  86.        will work best where pos_n > word_n."""
  87.  
  88.         seed_no = randint(0,len(self.tagged_words)-self.pos_n) # choose random seed
  89.         output = [self.tagged_words[seed_no+x] for x in range(self.pos_n-1)]
  90.         for x in range(self.pos_n-1, length):
  91.             next_pos_key = tuple([tw.pos for tw in output[-(self.pos_n-1):]])
  92.             next_pos_choices = self.pos_dictionary[next_pos_key]
  93.             next_word_key = tuple(output[-(self.word_n-1):])
  94.             next_picked = False
  95.             while not next_picked:
  96.                 shuffle(next_pos_choices)
  97.                 next_pos = next_pos_choices.pop()
  98.                 choices = self.get_word_by_pos(self.word_dictionary[next_word_key], next_pos)
  99.                 if choices:
  100.                     output.append(choice(choices))
  101.                     next_picked = True
  102.                 else:
  103.                     pass
  104.  
  105.         return " ".join([tw.word for tw in output])
  106.  
  107. mymarkov = Markov("corpus.txt")
  108.  
  109. print(mymarkov.generate(500, n=7))
RAW Paste Data