from __future__ import division
import nltk, re, pprint, random
# Read the whole transcript; the with-block closes the file even if the read
# fails (the handle was previously left open for the life of the process).
with open('CSCI3651-Fall2011.txt') as f:
    text = f.read()
# The split treats a carriage return as the separator between transcript lines.
lines = re.split(r'\r', text)
# For lines shaped like "[...] name: message", capture the message part.
# findall always returns a list; it is empty when the pattern does not match.
statements = [re.findall(r'\[.+\] \w+,?\s?\w+?\.?: (.+)', line) for line in lines]
# checking for anything the regex has missed
# (an unmatched line is always an empty list, never "" or None, so a single
# emptiness check covers every miss)
print([index for index, statement in enumerate(statements) if not statement])
# Skip unmatched lines here: indexing [0] on an empty result would raise
# IndexError and abort the script right after reporting the misses.
raw = " ".join(statement[0] for statement in statements if statement)
text = raw.split(" ")
# Bigram model: for each word, a frequency distribution of the words seen
# immediately after it.
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
def probMap(fdist):
    """Expand a frequency mapping into a flat list of repeated keys.

    Each key of ``fdist`` appears in the result as many times as its
    (integer) count, so a uniform random pick from the returned list selects
    a key with probability proportional to that count.

    :param fdist: mapping of item -> integer count; anything that provides
        ``items()`` works (plain dict, Counter, frequency distribution).
    :return: list containing every key repeated ``count`` times; empty when
        ``fdist`` is empty or all counts are zero.
    """
    array = []
    # items() instead of the Python-2-only iteritems(), so the function also
    # works with mappings that do not define iteritems().
    for key, value in fdist.items():
        array.extend([key] * value)
    return array
# It would be nice to choose the next word directly as a function of its
# frequency. I had hoped that might be in ConditionalFreqDist (in
# probability.py), but I can't see anything obvious; cfd._fdists is a dict
# mapping words to FreqDist objects.
def generate_model(cfdist, word, num=15):
    """Write up to ``num`` randomly generated words to standard output.

    Starting from ``word``, each following word is drawn at random from the
    words recorded after the current one in ``cfdist``, weighted by how often
    each was seen. Words are written as they are generated, separated by
    single spaces, and the line is terminated with a newline.

    :param cfdist: mapping of word -> frequency mapping of following words
        (indexed as ``cfdist[word]`` and expanded with ``probMap``).
    :param word: the seed word; it is always the first word written.
    :param num: maximum number of words to write (default 15).
    :return: None; the text is written to ``sys.stdout``.

    Generation stops early when the current word has no recorded successor,
    instead of failing inside ``random.choice`` on an empty list.
    """
    import sys  # local import: sys is not among the file's top-level imports

    wrote_any = False
    for _ in range(num):
        # "%s" formatting accepts any printable word, like print does.
        sys.stdout.write(("%s" if not wrote_any else " %s") % (word,))
        wrote_any = True
        candidates = probMap(cfdist[word])
        if not candidates:
            # Dead end: nothing was ever observed after this word.
            break
        word = random.choice(candidates)
    if wrote_any:
        sys.stdout.write("\n")
# Generate 35 words of sample text from the bigram model, seeded with 'I'.
generate_model(cfdist=cfd, word='I', num=35)