# Untitled

'''
CPSC 420 -- Program #0
Ryan Harris, University of Mary Washington

This program builds n-gram models, then generates simulated samples based
on the models.
'''
import sys
import random as r

# non-alphanumeric characters that parse() keeps in the corpus
PERMITTED_CHARS = [' ', '-', "'", ',', '.']


def parse(corpus):
    """
    Cleans a corpus and flattens it into a single lower-case string.

    IN  --  corpus      Iterable of text lines to clean.
    OUT --  One string: each line with every character removed that is
            neither alphanumeric nor listed in PERMITTED_CHARS (this also
            drops the line's newline), the lines joined by single spaces,
            and the whole result lower-cased.
    """
    def permitted(ch):
        return ch.isalnum() or ch in PERMITTED_CHARS

    # filter each line character by character, then glue the lines together
    cleaned = [''.join(filter(permitted, line)) for line in corpus]
    return ' '.join(cleaned).lower()
def build(text, n):
    """
    Counts the n-grams that occur in a given text.

    IN  --  text        The text to analyze; it is split on whitespace into words.
            n           The n in n-gram, AKA the number of words per phrase.
    OUT --  count       For n == 1, a dictionary mapping each word to the number
                        of times it occurs.  For n > 1, dictionaries nested n
                        levels deep, where count[w1]...[wn] is the number of
                        times the phrase "w1 ... wn" occurs.  Empty when n < 1
                        or when the text has fewer than n words.

    The resulting dictionary is also printed to stdout before it is returned.
    """
    words = text.split()
    count = {}

    # a window of fewer than one word is meaningless; leave the model empty
    if n >= 1:
        # slide a window of n words across the text
        for start in range(len(words) - n + 1):
            # descend (creating levels as needed) one dictionary per context word
            node = count
            for word in words[start:start + n - 1]:
                node = node.setdefault(word, {})

            # the innermost dictionary holds the actual counts
            last = words[start + n - 1]
            node[last] = node.get(last, 0) + 1

    # echo the finished model to stdout
    print(count)
    return count

def sim(unigrams, bigrams, trigrams, n):
    """
    Simulates an n-word text from the given n-gram models and prints it.

    The highest-order non-empty model decides the mode: trigrams, else
    bigrams, else unigrams.  If all three are empty no words are generated.
    In trigram mode the first word comes from the unigram model and the
    second from the bigram model, because there is not enough context yet.

    IN  --  unigrams    A dictionary of unigrams and counts.
            bigrams     A dictionary of bigrams and counts (may be empty).
            trigrams    A dictionary of trigrams and counts (may be empty).
            n           Number of words to generate.
    OUT --  None
    """
    print('Simulating ', end='')
    unikeys = list(unigrams.keys())
    univals = list(unigrams.values())
    words = []

    if trigrams:
        print("trigrams...")

        for i in range(n):
            if i == 0:
                # no context yet: start from the unigram model
                word = generateUnigram(unikeys, univals)
            elif i == 1:
                # a single word of context: use the bigram model
                word = generateBigram(words[-1], unikeys, univals, bigrams)
            else:
                # condition on the two most recently generated words
                word = generateTrigram(words[-2], words[-1], unikeys, univals,
                                       trigrams, bigrams)
            words.append(word)

    elif bigrams:
        print("bigrams...")

        for i in range(n):
            # always start with unigram model
            if i == 0:
                word = generateUnigram(unikeys, univals)
            else:
                word = generateBigram(words[-1], unikeys, univals, bigrams)
            words.append(word)

    elif unigrams:
        print('unigrams...')

        for _ in range(n):
            words.append(generateUnigram(unikeys, univals))

    # every generated word is followed by a single space
    print(''.join(word + ' ' for word in words))


def generateTrigram(first, second, unikeys, univals, trigramModel=None, bigramModel=None):
    """
    Generates a single word based on the trigram model.

    Backs off when the context is unknown: if `first` is in the trigram model
    but `second` never followed it, the bigram model is used on `second`; if
    `first` is not in the trigram model at all, the unigram model is used.

    IN  --  first           The second word prior to the one being generated.
            second          The first word prior to the one being generated.
            unikeys         A list of unigram keys. (for generateUnigram)
            univals         A list of unigram values. (for generateUnigram)
            trigramModel    Optional nested trigram dictionary.  When omitted,
                            the module-level `trigrams` is used.
            bigramModel     Optional bigram dictionary, handed on to
                            generateBigram when backing off.
    OUT --  third           The generated word.
    """
    if trigramModel is None:
        # older call style: fall back to the module-level model
        trigramModel = trigrams

    if first not in trigramModel:
        return generateUnigram(unikeys, univals)

    followers = trigramModel[first]
    if second in followers:
        return _sampleFrom(followers[second])

    return generateBigram(second, unikeys, univals, bigramModel)


def generateBigram(prev, unikeys, univals, bigramModel=None):
    """
    Generates a single word based on the bigram model.

    Falls back to the unigram model when `prev` is not in the bigram model.

    IN  --  prev            The first word prior to the one being generated.
            unikeys         A list of unigram keys. (for generateUnigram)
            univals         A list of unigram values. (for generateUnigram)
            bigramModel     Optional bigram dictionary.  When omitted, the
                            module-level `bigrams` is used.
    OUT --  The generated word.
    """
    if bigramModel is None:
        # older call style: fall back to the module-level model
        bigramModel = bigrams

    if prev in bigramModel:
        return _sampleFrom(bigramModel[prev])

    return generateUnigram(unikeys, univals)


def generateUnigram(keys, vals):
    """
    Generates a single word based on the unigram model.

    IN  --  keys        A list of unigram keys.
            vals        A list of unigram values, used as sampling weights.
    OUT --  The generated word.
    """
    return r.choices(keys, vals)[0]


def _sampleFrom(dist):
    """
    Draws one key from a {word: count} dictionary, weighted by its count.
    """
    # r.choices wants the population and the weights as separate sequences
    return r.choices(list(dist.keys()), list(dist.values()))[0]


if __name__ == '__main__':

    # command line usage
    usage = "Usage: python3 rharris4_ngram.py [corpus path] [n-gram n (1-3)] [words to generate]"
    if len(sys.argv) != 4:
        print(usage)
        sys.exit(1)

    corpusPath = sys.argv[1]
    try:
        # int() instead of eval(): eval would execute any expression typed on
        # the command line, and could hand back non-integer values
        sel = int(sys.argv[2])
        num = int(sys.argv[3])
    except ValueError:
        print(usage)
        sys.exit(1)

    # reject unsupported model orders before doing any work
    if sel not in (1, 2, 3):
        print("This n-gram isn't supported... so sorry!")
        sys.exit(1)

    try:
        with open(corpusPath, 'r') as f:
            corpus = f.readlines()
    except OSError:
        print("The file couldn't be opened... it does exist, right?")
        # without a corpus there is nothing to model, so stop here
        sys.exit(1)

    parsedCorpus = parse(corpus)

    # These stay module-level names on purpose: generateBigram() and
    # generateTrigram() can read `bigrams` / `trigrams` as globals.
    # Every mode needs the lower-order models as well, for backing off.
    unigrams = build(parsedCorpus, 1)
    bigrams = build(parsedCorpus, 2) if sel >= 2 else {}
    trigrams = build(parsedCorpus, 3) if sel == 3 else {}

    # generate a fresh sample each time the user presses Enter
    while True:
        key = input("\nReady.  Press any key to simulate.  Enter q to quit: ")

        if key == 'q':
            sys.exit()
        sim(unigrams, bigrams, trigrams, num)