from __future__ import division
import nltk, re, pprint, random

# Read the whole transcript in one go; the with-block guarantees the file
# handle is closed even if reading fails.
# NOTE(review): this script targets Python 2. Under Python 3's default
# universal-newlines text mode the '\r' separators would be translated to
# '\n' on read and the split below would no longer find them.
with open('CSCI3651-Fall2011.txt') as f:
    text = f.read()

# Records in the transcript are separated by bare carriage returns.
lines = re.split(r'\r', text)

# Each record is expected to be a bracketed prefix (presumably a timestamp),
# a speaker name, then ": " followed by the message; only the message is
# captured. re.findall returns a list holding the captured message, or an
# empty list when the line does not match the pattern.
statements = [re.findall(r'\[.+\] \w+,?\s?\w+?\.?: (.+)', line) for line in lines]

# Sanity check: print the indices of any lines the regex failed to parse.
# re.findall always returns a list, so the "" and None probes cannot match
# anything; they are kept so the script still prints its three diagnostic
# lists. Only the empty-list probe can actually report a missed line.
# A single parenthesised argument prints identically on Python 2 and 3.
print([index for index, statement in enumerate(statements) if statement == ""])
print([index for index, statement in enumerate(statements) if statement is None])
print([index for index, statement in enumerate(statements) if statement == []])

# Join every parsed message into one corpus string. Lines the regex did not
# match produce an empty findall result; they are skipped here because
# indexing them with [0] would raise IndexError.
raw = " ".join([statement[0] for statement in statements if statement])
# Tokenise on single spaces (note: this rebinds `text` from the raw file
# contents to the token list).
text = raw.split(" ")
# Pair each token with its successor and count, per word, which words follow.
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)

def probMap(fdist):
    """Expand a frequency distribution into a flat list for weighted sampling.

    Each key of ``fdist`` appears in the result as many times as its count,
    so ``random.choice`` on the returned list picks keys in proportion to
    their frequency.

    Args:
        fdist: mapping of item -> integer count (e.g. an nltk FreqDist).

    Returns:
        A list with every key repeated ``count`` times, in the mapping's
        iteration order. Keys whose count is zero or negative are omitted.
    """
    array = []
    # items() is available on both Python 2 and Python 3 mappings,
    # whereas iteritems() exists only on Python 2.
    for key, value in fdist.items():
        array.extend([key] * value)
    return array


# It would be nice to work out how to choose the next word as a function of
# its frequency. I had hoped that might be in ConditionalFreqDist (in
# probability.py), but can't see anything obvious; cfd._fdists is a dict
# mapping words to FreqDist objects.
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print word,
        word = random.choice(probMap(cfdist[word]))


# Generate 35 words of text, seeded with the word 'I'.
generate_model(cfd, 'I', 35)