Advertisement
Guest User

Untitled

a guest
Apr 25th, 2018
20
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.51 KB | None | 0 0
'''
CPSC 420 -- Program #0
Ryan Harris, University of Mary Washington

This program builds unigram, bigram and trigram models from a text corpus,
then generates simulated text samples based on those models.
'''
  8. import sys
  9. import random as r
  10.  
  11. PERMITTED_CHARS = [' ', '-', '\'', ',', '.']
  12.  
  13. '''
  14. Removes non-alphanumeric characters from a given corpus.
  15. IN -- Corpus to parse.
  16. OUT -- Parsed corpus.
  17. '''
  18. def parse(corpus):
  19. parsed = list()
  20.  
  21. for line in corpus:
  22. # for each character in the line, join together all permitted characters
  23. newLine = ''.join(ch for ch in line if ch.isalnum() or ch in PERMITTED_CHARS)
  24. parsed.append(newLine)
  25. parsedCorpus = ' '.join(line for line in parsed)
  26. parsedCorpus = parsedCorpus.lower()
  27. return parsedCorpus
  28. '''
  29. Gets n-grams from a given text.
  30. IN -- text The text to analyze.
  31. n The n in n-gram, AKA the size of phrases to get.
  32. OUT -- ngrams A list of n-grams from the text.
  33. '''
  34. def build(text, n):
  35. split = text.split()
  36. ngrams = []
  37. count = {}
  38.  
  39. # unigrams
  40. if n is 1:
  41. for i in range(0, len(split)):
  42. if split[i] in count:
  43. count[split[i]] += 1
  44. else:
  45. count[split[i]] = 1
  46.  
  47. # bigrams
  48. elif n is 2:
  49. for i in range(0, len(split)-1):
  50. ngrams.append(split[i] + ' ' + split[i+1])
  51.  
  52. for word in ngrams:
  53. split = word.split()
  54.  
  55. if split[0] in count:
  56. if split[1] in count[split[0]]:
  57. count[split[0]][split[1]] += 1
  58. else:
  59. count[split[0]][split[1]] = 1
  60. else:
  61. count[split[0]] = {}
  62. count[split[0]][split[1]] = 1
  63.  
  64. # trigrams
  65. elif n is 3:
  66. for i in range(0, len(split)-2):
  67. ngrams.append(split[i] + ' ' + split[i+1] + ' ' + split[i+2])
  68.  
  69. for word in ngrams:
  70. split = word.split()
  71.  
  72. if split[0] in count:
  73. if split[1] in count[split[0]]:
  74. if split[2] in count[split[0]][split[1]]:
  75. count[split[0]][split[1]][split[2]] += 1
  76. else:
  77. count[split[0]][split[1]][split[2]] = 1
  78. else:
  79. count[split[0]][split[1]] = {}
  80. count[split[0]][split[1]][split[2]] = 1
  81. else:
  82. count[split[0]] = {}
  83. count[split[0]][split[1]] = {}
  84. count[split[0]][split[1]][split[2]] = 1
  85.  
  86.  
  87. print(count)
  88. return count
  89.  
  90. '''
  91. Simulates a n-word text from a given dictionary of n-grams.
  92. IN -- unigrams A dictionary of unigrams and counts.
  93. bigrams A dictionary of bigrams and counts.
  94. trigrams A dictionary of trigrams and counts.
  95. n Number of words to generate.
  96. OUT -- None
  97. '''
  98. def sim(unigrams, bigrams, trigrams, n):
  99. print('Simulating ', end='')
  100. gen = str()
  101. unikeys = list(unigrams.keys())
  102. univals = list(unigrams.values())
  103.  
  104. if len(trigrams):
  105. print("trigrams...")
  106.  
  107. first = str()
  108. second = str()
  109. for i in range(0, n):
  110. if i is 0:
  111. first = generateUnigram(unikeys, univals)
  112. gen += first + ' '
  113. if i is 1:
  114. second = generateBigram(first, unikeys, univals)
  115. gen += second + ' '
  116.  
  117. else:
  118. third = generateTrigram(first, second, unikeys, univals)
  119. gen += third + ' '
  120.  
  121. # shift each word up per iteration
  122. tmp = second
  123. second = third
  124. first = tmp
  125.  
  126. elif len(bigrams):
  127. print("bigrams...")
  128.  
  129. prev = str()
  130. for i in range(0, n):
  131. # always start with unigram model
  132. if i is 0:
  133. prev = r.choices(unikeys, univals)[0]
  134. gen += prev + ' '
  135. else:
  136. prev = generateBigram(prev, unikeys, univals)
  137. gen += prev + ' '
  138.  
  139. elif len(unigrams):
  140. print('unigrams...')
  141.  
  142. for i in range(0, n):
  143. gen += generateUnigram(unikeys, univals) + ' '
  144.  
  145. print(gen)
  146.  
  147. '''
  148. Generates a single word based on the trigram model.
  149. IN -- first The second word prior to the one being generated.
  150. second The first word prior to the one being generated.
  151. unikeys A list of unigram keys. (for generateUnigram)
  152. univals A list of unigram values. (for generateUnigram)
  153. OUT -- third The generated word.
  154. '''
  155. def generateTrigram(first, second, unikeys, univals):
  156. # if trigrams[first][second] exists, use that distribution
  157. # if trigrams[first] exists, use bigram distribution
  158. # if none exist, use unigram distribution
  159. if first in trigrams:
  160. if second in trigrams[first]: # trigrams[first][second]
  161. tempKeyDist = []
  162. tempValDist = []
  163. for key, val in trigrams[first][second].items():
  164. tempKeyDist.append(key)
  165. tempValDist.append(val)
  166. third = r.choices(tempKeyDist, tempValDist)[0]
  167.  
  168. else: # trigrams[first]
  169. third = generateBigram(second, unikeys, univals)
  170.  
  171. else:
  172. third = generateUnigram(unikeys, univals)
  173.  
  174. return third
  175.  
  176. '''
  177. Generates a single word based on the bigram model.
  178. IN -- prev The first word prior to the one being generated.
  179. unikeys A list of unigram keys. (for generateUnigram)
  180. univals A list of unigram values. (for generateUnigram)
  181. OUT -- prev The generated word.
  182. '''
  183. def generateBigram(prev, unikeys, univals):
  184. if prev in bigrams:
  185. tempKeyDist = []
  186. tempValDist = []
  187. # format the distribution nicely for r.choices
  188. for key, val in bigrams[prev].items():
  189. tempKeyDist.append(key)
  190. tempValDist.append(val)
  191.  
  192. prev = r.choices(tempKeyDist, tempValDist)[0]
  193. else:
  194. prev = generateUnigram(unikeys, univals)
  195.  
  196. return prev
  197.  
  198. '''
  199. Generates a single word based on the unigram model.
  200. IN -- keys A list of unigram keys.
  201. vals A list of unigram values.
  202. OUT -- prev The generated word.
  203. '''
  204. def generateUnigram(keys, vals):
  205. prev = r.choices(keys, vals)[0]
  206. return prev
  207.  
  208.  
  209. if __name__ == '__main__':
  210.  
  211. # command line usage
  212. if len(sys.argv) is 4:
  213. corpusPath = sys.argv[1]
  214. sel = eval(sys.argv[2])
  215. num = eval(sys.argv[3])
  216. else:
  217. print("Usage: python3 rharris4_ngram.py [corpus path] [n-gram n (1-3)] [words to generate]")
  218. exit()
  219.  
  220. try:
  221. with open(corpusPath, 'r') as f:
  222. corpus = f.readlines()
  223. except OSError:
  224. print("The file couldn't be opened... it does exist, right?")
  225.  
  226. parsedCorpus = parse(corpus)
  227. unigrams = {}
  228. bigrams = {}
  229. trigrams = {}
  230.  
  231. if sel is 3:
  232. unigrams = build(parsedCorpus, 1)
  233. bigrams = build(parsedCorpus, 2)
  234. trigrams = build(parsedCorpus, 3)
  235. elif sel is 2:
  236. unigrams = build(parsedCorpus, 1)
  237. bigrams = build(parsedCorpus, 2)
  238. elif sel is 1:
  239. unigrams = build(parsedCorpus, 1)
  240. else:
  241. print("This n-gram isn't supported... so sorry!")
  242. exit()
  243.  
  244.  
  245. while True:
  246. key = input("\nReady. Press any key to simulate. Enter q to quit: ")
  247.  
  248. if key is 'q':
  249. exit()
  250. sim(unigrams, bigrams, trigrams, num)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement