Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk.corpus import inaugural, stopwords
- from nltk import FreqDist, ConditionalFreqDist
- import re
#Section 2, Task 1: Produce a list of overlapping lists of inaugural addresses
#Add script for inaug20 here:
# Each window holds 5 consecutive addresses: terms are 4 years apart, so 5
# addresses span a ~20-year period; there are len(fileids) - 4 such windows.
# (The original looped with m = -1 / n = 4 counters and a dead no-op slice
# statement, and re-read inaugural.fileids() on every iteration.)
_fileids = inaugural.fileids()
inaug20 = [_fileids[i:i + 5] for i in range(len(_fileids) - 4)]
#Section 2, Task 2: Frequency distribution of words, excluding fn words and punct
#Add script for word_fdist here:
import string

def word_fdist(inaug_list):
    """Return a FreqDist of the words in the given inaugural addresses,
    excluding punctuation tokens, the '--' token, and English stopwords
    (matched case-insensitively).

    inaug_list -- iterable of inaugural-corpus fileids.
    """
    # Build the filter sets once, outside the loop: set membership is O(1),
    # while the original tested each token against the stopword *list*.
    stop_words = set(stopwords.words('english'))
    skip_tokens = set(string.punctuation) | {"--"}
    kept = []
    for fileid in inaug_list:
        kept.extend(w for w in inaugural.words(fileid)
                    if w not in skip_tokens and w.lower() not in stop_words)
    return FreqDist(kept)
#Section 2, Task 2: Also compute and print the 20 most common words in each
#of the overlapping 20-year periods. Add script for print_most_common here:
def print_most_common():
    """Print the 20 most frequent filtered words for each overlapping
    20-year period in inaug20, with a blank line between periods."""
    # Iterate the periods directly instead of keeping a manual counter m
    # that merely shadowed the range index.
    for period in inaug20:
        # most_common(20) returns (word, count) pairs sorted by descending
        # count; the original relied on FreqDist.items() being sorted,
        # which NLTK 3 no longer guarantees.
        print(word_fdist(period).most_common(20))
        print("\n")
#Section 2, Task 3: Frequency distribution of sentence lengths,
#excluding stopwords and punctuation. Add script for set_length_fdist here:
def sent_length_fdist(inaug_list):
    """Return a FreqDist of sentence lengths over the given addresses.

    A sentence's length is the number of its tokens that are not
    punctuation, '--', or an English stopword (case-insensitive), per the
    task statement. The original draft built filtered lists it never used,
    printed lengths for the *entire* corpus regardless of inaug_list, and
    returned None because its return statement was commented out.

    inaug_list -- iterable of inaugural-corpus fileids.
    """
    stop_words = set(stopwords.words('english'))
    skip_tokens = set(string.punctuation) | {"--"}
    lengths = []
    for fileid in inaug_list:
        for sent in inaugural.sents(fileid):
            lengths.append(len([w for w in sent
                                if w not in skip_tokens
                                and w.lower() not in stop_words]))
    return FreqDist(lengths)
- #def sent_length_fdist(inaug_list):
- # dashdash = ["--"]
- # for i in range(len(inaugural.sents())):
- # sents_no_punc = [w for s in inaugural.sents() for w in s
- # if w not in string.punctuation and w not in dashdash]
- # print (len(inaugural.sents()))
- #Section 2, Task 3: Also compute and print the average sentence lengths in each of
- #the overlapping 20-year periods. Add script for print_average_lengths here:
- #Section 2, Task 4: Conditional freq distribution of words following 'I'/'my' or
- #preceding 'me', plus printing samples that occur >1 for each pro-period pair.
- #Add your script for build_cond_fdist here:
- #Section 2, Task 4: Also compute and print, for each pronoun and each 20-year
- #period, the list of words accompanying the pronoun more than once in the
- #addresses within the period. Add your script for print_Imyme_words here:
- #End of file
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement