Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- '''
- CPSC 420 -- Program #0
- Ryan Harris, University of Mary Washington
- This program builds n-gram models, then generates simulated samples based
- on the models.
- '''
- import sys
- import random as r
PERMITTED_CHARS = [' ', '-', '\'', ',', '.']


def parse(corpus):
    '''
    Normalizes a corpus into a single lowercase string.

    Every character that is neither alphanumeric nor listed in
    PERMITTED_CHARS is dropped (this includes newlines and tabs), the
    cleaned lines are joined with single spaces, and the result is
    lowercased.

    IN  -- corpus An iterable of lines (strings) to parse.
    OUT -- The parsed corpus as one string.
    '''
    def keep(ch):
        # a character survives if it is a letter/digit or whitelisted
        return ch.isalnum() or ch in PERMITTED_CHARS

    cleaned = [''.join(filter(keep, line)) for line in corpus]
    return ' '.join(cleaned).lower()
def build(text, n):
    '''
    Counts the n-grams of a text.

    The text is split on whitespace and every run of n consecutive words
    is counted in a dictionary nested n levels deep:
        n = 1 -> {word: count}
        n = 2 -> {first: {second: count}}
        n = 3 -> {first: {second: {third: count}}}
    and so on for larger n.

    IN  -- text  The text to analyze.
           n     The n in n-gram, AKA the size of phrases to count.
    OUT -- count The nested dictionary of counts. It is empty when the
                 text has fewer than n words or n is not a positive
                 integer.
    '''
    words = text.split()
    count = {}
    # n is checked by type and value (never by identity) so that any
    # positive integer selects a model, however it was produced.
    if isinstance(n, int) and n >= 1:
        for start in range(len(words) - n + 1):
            *context, last = words[start:start + n]
            # walk (creating as needed) one dictionary level per context
            # word, then bump the counter stored under the final word
            node = count
            for word in context:
                node = node.setdefault(word, {})
            node[last] = node.get(last, 0) + 1
    # NOTE: dumps the whole model to stdout on every call; kept because
    # the program has always produced this output.
    print(count)
    return count
def sim(unigrams, bigrams, trigrams, n):
    '''
    Simulates an n-word text and prints it.

    The highest-order non-empty model decides how words are generated:
    trigrams first, then bigrams, then unigrams. The first word always
    comes from the unigram model; in trigram mode the second word comes
    from the bigram model because only one word of context exists yet.

    IN  -- unigrams A dictionary of unigrams and counts.
           bigrams  A dictionary of bigrams and counts.
           trigrams A dictionary of trigrams and counts.
           n        Number of words to generate.
    OUT -- None (the generated text is printed).
    '''
    print('Simulating ', end='')
    words = []
    unikeys = list(unigrams.keys())
    univals = list(unigrams.values())
    if trigrams:
        print("trigrams...")
        first = str()
        second = str()
        for i in range(n):
            if i == 0:
                first = generateUnigram(unikeys, univals)
                words.append(first)
            elif i == 1:
                # must be elif: exactly one word is produced per iteration
                second = generateBigram(first, unikeys, univals, bigrams)
                words.append(second)
            else:
                third = generateTrigram(first, second, unikeys, univals,
                                        bigrams, trigrams)
                words.append(third)
                # shift the two-word context window forward
                first, second = second, third
    elif bigrams:
        print("bigrams...")
        prev = str()
        for i in range(n):
            if i == 0:
                # always start with unigram model
                prev = generateUnigram(unikeys, univals)
            else:
                prev = generateBigram(prev, unikeys, univals, bigrams)
            words.append(prev)
    elif unigrams:
        print('unigrams...')
        for i in range(n):
            words.append(generateUnigram(unikeys, univals))
    # every word is followed by a single space, including the last one
    print(''.join(word + ' ' for word in words))


def _pickWeighted(counts):
    '''Draws one key of a {word: count} dictionary, weighted by count.'''
    return r.choices(list(counts.keys()), list(counts.values()))[0]


def generateTrigram(first, second, unikeys, univals, bigrams=None, trigrams=None):
    '''
    Generates a single word based on the trigram model.

    Back-off order:
      * trigrams[first][second] exists -> draw from that distribution
      * only trigrams[first] exists    -> bigram model, conditioned on second
      * first is unknown               -> unigram model

    IN  -- first    The second word prior to the one being generated.
           second   The first word prior to the one being generated.
           unikeys  A list of unigram keys. (for generateUnigram)
           univals  A list of unigram values. (for generateUnigram)
           bigrams  Optional bigram model; when omitted, a module-level
                    variable named bigrams is used if one exists.
           trigrams Optional trigram model; when omitted, a module-level
                    variable named trigrams is used if one exists.
    OUT -- third    The generated word.
    '''
    if trigrams is None:
        # legacy callers relied on a module-level model
        trigrams = globals().get('trigrams', {})
    if first not in trigrams:
        return generateUnigram(unikeys, univals)
    if second in trigrams[first]:
        return _pickWeighted(trigrams[first][second])
    return generateBigram(second, unikeys, univals, bigrams)


def generateBigram(prev, unikeys, univals, bigrams=None):
    '''
    Generates a single word based on the bigram model.

    Falls back to the unigram model when prev has no recorded followers.

    IN  -- prev    The first word prior to the one being generated.
           unikeys A list of unigram keys. (for generateUnigram)
           univals A list of unigram values. (for generateUnigram)
           bigrams Optional bigram model; when omitted, a module-level
                   variable named bigrams is used if one exists.
    OUT -- The generated word.
    '''
    if bigrams is None:
        # legacy callers relied on a module-level model
        bigrams = globals().get('bigrams', {})
    if prev in bigrams:
        return _pickWeighted(bigrams[prev])
    return generateUnigram(unikeys, univals)


def generateUnigram(keys, vals):
    '''
    Generates a single word based on the unigram model.

    IN  -- keys A list of unigram keys.
           vals A list of unigram values, used as weights for keys.
    OUT -- The generated word.
    '''
    return r.choices(keys, vals)[0]
if __name__ == '__main__':
    # command line usage
    usage = "Usage: python3 rharris4_ngram.py [corpus path] [n-gram n (1-3)] [words to generate]"
    if len(sys.argv) != 4:
        print(usage)
        sys.exit(1)
    corpusPath = sys.argv[1]
    try:
        # int() rather than eval(): command-line text must never be
        # executed as code, and only whole numbers make sense here
        sel = int(sys.argv[2])
        num = int(sys.argv[3])
    except ValueError:
        print(usage)
        sys.exit(1)

    try:
        with open(corpusPath, 'r') as f:
            corpus = f.readlines()
    except OSError:
        print("The file couldn't be opened... it does exist, right?")
        # stop here: without a corpus there is nothing to parse
        sys.exit(1)

    parsedCorpus = parse(corpus)

    # These stay module-level names: the generate* helpers may look the
    # bigram/trigram models up as globals.
    unigrams = {}
    bigrams = {}
    trigrams = {}
    if sel not in (1, 2, 3):
        print("This n-gram isn't supported... so sorry!")
        sys.exit(1)
    # every order also needs all lower-order models for back-off
    unigrams = build(parsedCorpus, 1)
    if sel >= 2:
        bigrams = build(parsedCorpus, 2)
    if sel == 3:
        trigrams = build(parsedCorpus, 3)

    while True:
        key = input("\nReady. Press any key to simulate. Enter q to quit: ")
        # compare by value; identity of str objects is not guaranteed
        if key == 'q':
            sys.exit()
        sim(unigrams, bigrams, trigrams, num)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement