Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # markov-char.py
- # Generate a bunch of new unique words from input words using markov chains
- #
- # Author: Michael Davies michael@spaceduststudios.com
- #
- # Usage: markov-char.py input_file markov_state_size
- #
- # input_file should contain one word per line
- # markov_state_size is the number of preceding characters used as the chain
- # state; values between 2 and 8 work well, depending on sample size
- import random
- import sys
# --- Command-line arguments and input loading ---------------------------
# Exit with a usage message rather than a bare IndexError when an argument
# is missing. Extra arguments are tolerated and ignored.
if len(sys.argv) < 3:
    sys.exit('Usage: markov-char.py input_file markov_state_size')

file = sys.argv[1]
# Word terminator. It is also the padding character that makes up the
# chain state in front of a word's first character.
nonchar = '\n'
markov_order = int(sys.argv[2])
# Start-of-word chain state: markov_order terminators in a row.
markov_char = nonchar * markov_order

with open(file, 'r') as f:
    # Each stored word keeps its trailing terminator. Only the final line
    # of a file can lack one, so it is appended where missing. Collecting
    # into a set removes duplicate input words.
    words = set(
        line if line.endswith(nonchar) else line + nonchar
        for line in f
    )

# An empty input leaves nothing to model (and max() below would raise),
# so stop with a clear message.
if not words:
    sys.exit('No input words found in ' + file)

longest_word_len = max(len(word) for word in words) - 1  # remove linefeed
# Transition table: maps a chain state (tuple of the preceding characters)
# to the list of characters observed immediately after that state.
# Repeated observations are kept, so a uniform pick from a list is weighted
# by how often each successor occurred in the input.
markov_dict = {}
for word in words:
    # Drop non-ASCII characters. Filtering on the code value works for both
    # Python 2 byte strings and Python 3 text strings, whereas
    # str.decode() exists only on Python 2.
    ascii_word = ''.join(ch for ch in word if ord(ch) < 128)
    for char in ascii_word:
        markov_dict.setdefault(tuple(markov_char), []).append(char)
        if char == nonchar:
            # End of word: the next word starts from the initial state.
            markov_char = nonchar * markov_order
        else:
            # Slide the state window forward by one character.
            markov_char = markov_char[1:] + char
def markov_choice(word='', markov_char=nonchar * markov_order):
    """Generate one word by a random walk over ``markov_dict``.

    Args:
        word: Prefix that the generated characters are appended to.
        markov_char: Current chain state (the most recent characters).
            The default, evaluated once at definition time, is the
            start-of-word state of ``markov_order`` terminators.

    Returns:
        ``word`` extended with every character drawn before the
        terminator ``nonchar`` was picked.

    Raises:
        KeyError: If the current state is not a key of ``markov_dict``.
    """
    # Iterative on purpose: one stack frame per generated character would
    # run into the interpreter's recursion limit on long walks.
    while True:
        char = random.choice(markov_dict[tuple(markov_char)])
        if char == nonchar:
            return word
        word += char
        markov_char = markov_char[1:] + char
# --- Generate candidate words and report the new ones -------------------
# Single-argument print(...) calls behave the same on Python 2 and 3.
print('Input words: ' + str(len(words)))
print('Longest input word length: ' + str(longest_word_len))
print('New unique words:')

# Collected in a set so repeated generations of the same word count once.
new_words = set()
for _ in range(100000):
    word = markov_choice()
    # Discard anything longer than the longest input word.
    if len(word) > longest_word_len:
        continue
    # Discard words already present in the input ...
    if word + '\n' in words:
        continue
    # ... and simple plural/singular variants of input words.
    if word + 's\n' in words:
        continue
    if word.endswith('s') and word[:-1] + '\n' in words:
        continue
    new_words.add(word)

# Set iteration order is arbitrary, so the listing is unordered.
for new_word in new_words:
    print(new_word)
print(len(new_words))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement