Advertisement
sds-michael

markov-char.py - Generate words using markov chains

Jul 30th, 2015
311
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.80 KB | None | 0 0
  1. # markov-char.py
  2. # Generate a bunch of new unique words from input words using markov chains
  3. #
  4. # Author: Michael Davies michael@spaceduststudios.com
  5. #
  6. # Usage: markov-char.py input_file markov_state_size
  7. #
  8. # input_file should contain one word per line
  9. # markov_state_size works well between 2-8 depending on sample size
  10.  
  11. import random
  12. import sys
  13.  
  14. file = sys.argv[1]
  15. nonchar = '\n'
  16. markov_order = int(sys.argv[2])
  17. markov_char = nonchar * markov_order
  18.  
  19. words = []
  20. with open(file, 'r') as f:
  21.     words = f.readlines()
  22. if not words[-1].endswith(nonchar):
  23.     words[-1] += nonchar
  24. words = set(words)
  25. longest_word_len = max([len(word) for word in words]) - 1 # remove linefeed
  26.  
  27. markov_dict = {}
  28. for word in words:
  29.     for char in word.decode('ascii', 'ignore').encode('ascii'):
  30.         if char == nonchar:
  31.             markov_dict.setdefault( tuple(markov_char), [] ).append(nonchar)
  32.             markov_char = nonchar * markov_order
  33.         else:
  34.             markov_dict.setdefault( tuple(markov_char), [] ).append(char)
  35.             markov_char = markov_char[1:] + char
  36.  
  37. def markov_choice(word = '', markov_char = nonchar * markov_order):
  38.     char = nonchar
  39.     char = random.choice(markov_dict[tuple(markov_char)])
  40.     if char == nonchar:
  41.         return word
  42.     word += char
  43.     return markov_choice(word, markov_char[1:] + char)
  44.  
  45. print 'Input words: ' + str(len(words))
  46. print 'Longest input word length: ' + str(longest_word_len)
  47.  
  48. print 'New unique words:'
  49. new_words = []
  50. for i in range(100000):
  51.     word = markov_choice()
  52.     if len(word) > longest_word_len:
  53.         continue
  54.     if word + '\n' in words:
  55.         continue
  56.     if word + 's\n' in words: # check plural cases
  57.         continue
  58.     if word.endswith('s') and word[:-1] + '\n' in words: # check plural cases
  59.         continue
  60.     new_words.append(word)
  61. new_words = set(new_words)
  62. for new_word in new_words:
  63.     print new_word
  64. print len(new_words)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement