document.write('
Data hosted with ♥ by Pastebin.com - Download Raw - See Original
  1. from __future__ import division
  2. import nltk, re, pprint, random
  3.  
  4. f = open(\'CSCI3651-Fall2011.txt\')
  5. text = f.readlines()
  6. text = "".join(text)
  7. lines = re.split(r\'\\r\',text)
  8. statements = [re.findall(r\'\\[.+\\] \\w+,?\\s?\\w+?\\.?: (.+)\',line) for line in lines]
  9.  
  10. # checking for anything the regex has missed
  11. print [index for index, statement in enumerate(statements) if statement == ""]
  12. print [index for index, statement in enumerate(statements) if statement == None]
  13. print [index for index, statement in enumerate(statements) if statement == []]
  14.  
  15. raw = " ".join([statement[0] for statement in statements])
  16. text = raw.split(" ")
  17. bigrams = nltk.bigrams(text)
  18. cfd = nltk.ConditionalFreqDist(bigrams)
  19.  
  def probMap(fdist):
      """Expand a frequency mapping into a flat list for weighted sampling.

      Each key of ``fdist`` appears in the result as many times as its count,
      so a uniform ``random.choice`` over the returned list picks keys in
      proportion to their frequency.

      Args:
          fdist: mapping of item -> integer count (e.g. an nltk FreqDist,
              a ``collections.Counter`` or a plain dict).

      Returns:
          A list holding every key repeated ``count`` times; empty when
          ``fdist`` is empty or every count is zero.
      """
      expanded = []
      # items() is available on Python 2 and Python 3 mappings alike, whereas
      # iteritems() only exists on Python 2.
      for key, count in fdist.items():
          # A count of zero (or less) contributes nothing, as with range(count).
          expanded.extend([key] * count)
      return expanded
  26.  
  27.  
  28. # would be nice to work out how to choose as a function of freq,
  29. # had hoped that might be in ConditionalFreqDist (in probability.py)
  30. # but can\'t see anything obvious  cfd._fdists is dict of words to a FreqDist object
  31. def generate_model(cfdist, word, num=15):
  32.     for i in range(num):
  33.         print word,
  34.         word = random.choice(probMap(cfdist[word]))
  35.        
  36.        
  # Print a 35-word chain seeded with the word "I".
  generate_model(cfd, "I", num=35)
');