Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re, random
- import numpy as np
- import json
# Plain-text poetry sources, opened relative to the current working directory.
poem_files = ['pg17192.txt', 'Where the Sidewalk Ends by Shel Silverstein_djvu.txt']

# One entry per kept source line: the list of its \w+ tokens, in order.
poem_lines = []

# Populated further down the script: how often each word opens a line, and
# those tallies converted to relative frequencies.
firstword_count_dict = {}
firstword_prob_dict = {}

for poem_file in poem_files:
    # The context manager closes the file even if reading raises part-way.
    with open(poem_file, 'r') as f:
        for line in f:
            # Raw string so that \w is a regex class, not a string escape.
            words = re.findall(r'\w+', line)
            # Lines with fewer than two tokens (blank lines included) are
            # skipped, which also makes a separate line-length check redundant.
            if len(words) > 1:
                poem_lines.append(words)
- #print poem_lines
# Corpus-wide tallies over the tokenized poem lines.
# NOTE(review): the paste lost its indentation; this assumes the per-word loop
# ran for every line, not only for lines ending in 'weary' -- confirm.

# Number of tokenized lines.
lines = len(poem_lines)

# Number of lines whose final token is exactly 'weary'.
last = sum(1 for verse in poem_lines if verse and verse[-1] == 'weary')

# Every token in reading order, and the set of distinct tokens.
words = [token for verse in poem_lines for token in verse]
unicorns = set(words)
- # firstword_counts = {}
- #
- # for line in poem_lines:
- # if len(line) > 2:
- # first_word = line[0]
- #
- # if first_word in firstword_counts:
- # firstword_counts[first_word] += 1
- # else:
- # firstword_counts[first_word] = 1
- #
- # print firstword_counts
- # words = words.replace(',', '').replace('.', ' ')
- #
- # word_states = re.findall('\w+', words)
- #
- # print word_states
- # print len(set(word_states))
- #
- # counts_dict = {}
- #
- # for i in range(len(word_states)-1):
- # first_word = word_states[i]
- # next_word = word_states[i+1]
- #
- # if (first_word, next_word) in counts_dict:
- # counts_dict[(first_word,next_word)] += 1
- # else:
- # counts_dict[(first_word,next_word)] = 1
- #
- # transition_probabilities = {}
- # s = sum(counts_dict.values())
- #
- #
- # for key in counts_dict:
- # transition_probabilities[key] = float(counts_dict[key])/s
- # print transition_probabilities
- # # float(counts_dict.keys())/s
- # #for x in range(10):
- # # sentence = ''
- # # for i in range(10):
- # # word = results[random.randint(0, len(results) - 1)]
- # # sentence += ' ' + word
- # # print sentence
- # next_word_counts_dict = {}
- # for word_list in poem_lines:
- # word_list.append('\n')
- # for i in range(len(word_list) - 1):
- # thisword = word_list[i]
- # nextword = word_list[i+1]
- #
- # if thisword not in next_word_counts_dict:
- # next_word_counts_dict[thisword] = {nextword:1}
- # else:
- # if nextword not in next_word_counts_dict[thisword]:
- # next_word_counts_dict[thisword][nextword] = 1
- # else:
- # next_word_counts_dict[thisword][nextword] += 1
# Tally how often each word opens a line.
for verse in poem_lines:
    if not verse:
        continue
    opener = verse[0]
    firstword_count_dict[opener] = firstword_count_dict.get(opener, 0) + 1

# Turn the tallies into relative frequencies (each count divided by the total).
count_sum = sum(firstword_count_dict.values())
firstword_prob_dict.update(
    (opener, tally / float(count_sum))
    for opener, tally in firstword_count_dict.items()
)
print(firstword_prob_dict)
# Transition table: word -> {following token: weight}.
# Despite the name, the weights are raw counts only until the normalization
# pass below; after it they are relative frequencies.
next_word_counts_dict = {}
for word_list in poem_lines:
    # Pair every word with its successor; the last word of a line is paired
    # with the '\n' end-of-line marker.
    for thisword, nextword in zip(word_list, word_list[1:] + ['\n']):
        followers = next_word_counts_dict.setdefault(thisword, {})
        followers[nextword] = followers.get(nextword, 0) + 1

# Normalize in place: divide each follower count by the total for that word.
for followers in next_word_counts_dict.values():
    count_sum = float(sum(followers.values()))
    # Only values are reassigned, so iterating the keys meanwhile is safe.
    for nextword in followers:
        followers[nextword] /= count_sum
# Index-aligned sequences for sampling a line's opening word: the candidate
# words and, at the same positions, their probabilities.
first_states = list(firstword_prob_dict)
pp = [firstword_prob_dict[candidate] for candidate in first_states]
print(first_states)
print(pp)
- #for i in range(len(next_word_counts_dict.items)):
- #print next_word_counts_dict.items()
# Generate and print 1000 lines by walking the transition table.
# NOTE(review): the paste lost its indentation; this assumes the draw/append
# pair sat inside the `if state in ...` branch and the print ran once per
# generated line -- confirm against the original script.
for _ in range(1000):
    # Opening word, drawn according to the first-word probabilities.
    state = np.random.choice(first_states, p=pp)
    sentence = [state]

    # At most nine further draws per line.
    for _ in range(9):
        if state not in next_word_counts_dict:
            # No recorded followers for this state, so no further draw is
            # possible; stop instead of idling through the remaining rounds.
            break
        followers = next_word_counts_dict[state]
        # keys() and values() of an unmodified dict are index-aligned.
        next_states = list(followers.keys())
        p = list(followers.values())
        state = np.random.choice(next_states, p=p)
        sentence.append(state)

    # The '\n' end-of-line marker is a state, not a word; drop it from output.
    print(' '.join([w for w in sentence if w != '\n']))
- #for word in next_word_counts_dict:
- # print next_word_counts_dict[word].items()
- # np.random.choice(word, p=)
- # word, ':', next_word_counts_dict[word]
# Report how many distinct words have an entry in the transition table.
print(len(next_word_counts_dict))

# Persist both tables as JSON in the current working directory. The context
# managers close each file even if serialization or the write raises.
with open('poem_model.json', "w") as output_file:
    output_file.write(json.dumps(next_word_counts_dict))

with open('poem_model_firstword.json', "w") as output_file2:
    output_file2.write(json.dumps(firstword_prob_dict))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement