Advertisement
Guest User

Untitled

a guest
Mar 25th, 2017
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.79 KB | None | 0 0
  1. import re, random
  2. import numpy as np
  3. import json
  4. poem_files = ['pg17192.txt', 'Where the Sidewalk Ends by Shel Silverstein_djvu.txt']
  5. poem_lines = []
  6. firstword_count_dict = {}
  7. firstword_prob_dict = {}
  8.  
  9. for poem_file in poem_files:
  10. f = open(poem_file, 'r')
  11.  
  12. for line in f:
  13. if len(line) > 1:
  14. words = re.findall('\w+', line)
  15. if len(words) > 1:
  16. poem_lines.append(words)
  17. f.close()
  18.  
  19. #print poem_lines
  20.  
  21. unicorns = set()
  22. words = []
  23. last = 0
  24. lines = 0
  25. for line in poem_lines:
  26. lines += 1
  27. if len(line) >= 1 and line[-1] == 'weary':
  28. last += 1
  29. for word in line:
  30. unicorns.add(word)
  31. words.append(word)
  32.  
  33. # firstword_counts = {}
  34. #
  35. # for line in poem_lines:
  36. # if len(line) > 2:
  37. # first_word = line[0]
  38. #
  39. # if first_word in firstword_counts:
  40. # firstword_counts[first_word] += 1
  41. # else:
  42. # firstword_counts[first_word] = 1
  43. #
  44. # print firstword_counts
  45. # words = words.replace(',', '').replace('.', ' ')
  46. #
  47. # word_states = re.findall('\w+', words)
  48. #
  49. # print word_states
  50. # print len(set(word_states))
  51. #
  52. # counts_dict = {}
  53. #
  54. # for i in range(len(word_states)-1):
  55. # first_word = word_states[i]
  56. # next_word = word_states[i+1]
  57. #
  58. # if (first_word, next_word) in counts_dict:
  59. # counts_dict[(first_word,next_word)] += 1
  60. # else:
  61. # counts_dict[(first_word,next_word)] = 1
  62. #
  63. # transition_probabilities = {}
  64. # s = sum(counts_dict.values())
  65. #
  66. #
  67. # for key in counts_dict:
  68. # transition_probabilities[key] = float(counts_dict[key])/s
  69. # print transition_probabilities
  70. # # float(counts_dict.keys())/s
  71. # #for x in range(10):
  72. # # sentence = ''
  73. # # for i in range(10):
  74. # # word = results[random.randint(0, len(results) - 1)]
  75. # # sentence += ' ' + word
  76. # # print sentence
  77.  
  78. # next_word_counts_dict = {}
  79. # for word_list in poem_lines:
  80. # word_list.append('\n')
  81. # for i in range(len(word_list) - 1):
  82. # thisword = word_list[i]
  83. # nextword = word_list[i+1]
  84. #
  85. # if thisword not in next_word_counts_dict:
  86. # next_word_counts_dict[thisword] = {nextword:1}
  87. # else:
  88. # if nextword not in next_word_counts_dict[thisword]:
  89. # next_word_counts_dict[thisword][nextword] = 1
  90. # else:
  91. # next_word_counts_dict[thisword][nextword] += 1
  92.  
  93. for line in poem_lines:
  94. if len(line) > 0:
  95. first_word = line[0]
  96.  
  97. if first_word in firstword_count_dict:
  98. firstword_count_dict[first_word] += 1
  99. else:
  100. firstword_count_dict[first_word] = 1
  101.  
  102. count_sum = sum(firstword_count_dict.values())
  103.  
  104. for first_word in firstword_count_dict:
  105. firstword_prob_dict[first_word] = firstword_count_dict[first_word]/float(count_sum)
  106.  
  107. print firstword_prob_dict
  108.  
  109. next_word_counts_dict = {}
  110. for word_list in poem_lines:
  111. for i in range(len(word_list)):
  112. thisword = word_list[i]
  113. if i < len(word_list)-1:
  114. nextword = word_list[i+1]
  115. else:
  116. nextword = '\n'
  117.  
  118. if thisword not in next_word_counts_dict:
  119. next_word_counts_dict[thisword] = {nextword:1}
  120. else:
  121. if nextword not in next_word_counts_dict[thisword]:
  122. next_word_counts_dict[thisword][nextword] = 1
  123. else:
  124. next_word_counts_dict[thisword][nextword] += 1
  125.  
  126. for thisword in next_word_counts_dict:
  127. count_sum = sum(next_word_counts_dict[thisword].values())
  128. for nextword in next_word_counts_dict[thisword]:
  129. old_next_word_count = next_word_counts_dict[thisword][nextword]
  130. next_word_counts_dict[thisword][nextword] = (float(next_word_counts_dict[thisword][nextword])/count_sum)
  131.  
  132. first_states = []
  133. pp = []
  134.  
  135. for key in firstword_prob_dict:
  136. first_states.append(key)
  137. pp.append(firstword_prob_dict[key])
  138. print first_states
  139. print pp
  140.  
  141. #for i in range(len(next_word_counts_dict.items)):
  142. #print next_word_counts_dict.items()
  143.  
  144. for x in range(1000):
  145. state = np.random.choice(first_states, p = pp)
  146. sentence = [state]
  147. for i in range(9):
  148. next_states = []
  149. p = []
  150. if state in next_word_counts_dict:
  151. for pair in next_word_counts_dict[state].items():
  152. next_states.append(pair[0])
  153. p.append(pair[1])
  154. state = np.random.choice(next_states, p = p)
  155. sentence.append(state)
  156. print ' '.join([w for w in sentence if w != '\n'])
  157.  
  158.  
  159.  
  160. #for word in next_word_counts_dict:
  161. # print next_word_counts_dict[word].items()
  162. # np.random.choice(word, p=)
  163. # word, ':', next_word_counts_dict[word]
  164. print len(next_word_counts_dict)
  165.  
  166. output_file = open('poem_model.json', "w")
  167. output_file.write(json.dumps(next_word_counts_dict))
  168. output_file.close()
  169.  
  170. output_file2 = open('poem_model_firstword.json', "w")
  171. output_file2.write(json.dumps(firstword_prob_dict))
  172. output_file2.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement