Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk.corpus import inaugural, stopwords
- from nltk import FreqDist, ConditionalFreqDist
- import re
#Section 2, Task 1: Produce a list of overlapping lists of inaugural addresses
#Add script for inaug20 here:
# Each window holds 5 consecutive addresses: terms are 4 years apart, so 5
# addresses span a ~20-year period; there are len(fileids) - 4 such windows.
# (The original looped with m = -1 / n = 4 counters and a dead no-op slice
# statement, and re-read inaugural.fileids() on every iteration.)
_fileids = inaugural.fileids()
inaug20 = [_fileids[i:i + 5] for i in range(len(_fileids) - 4)]
#Section 2, Task 2: Frequency distribution of words, excluding fn words and punct
#Add script for word_fdist here:
import string

def word_fdist(inaug_list):
    """Return a FreqDist of the words in the given inaugural addresses,
    excluding punctuation tokens, the '--' token, and English stopwords
    (matched case-insensitively).

    inaug_list -- iterable of inaugural-corpus fileids.
    """
    # Build the filter sets once, outside the loop: set membership is O(1),
    # while the original tested each token against the stopword *list*.
    stop_words = set(stopwords.words('english'))
    skip_tokens = set(string.punctuation) | {"--"}
    kept = []
    for fileid in inaug_list:
        kept.extend(w for w in inaugural.words(fileid)
                    if w not in skip_tokens and w.lower() not in stop_words)
    return FreqDist(kept)
#Section 2, Task 2: Also compute and print the 20 most common words in each
#of the overlapping 20-year periods. Add script for print_most_common here:
def print_most_common():
    """Print the 20 most frequent filtered words for each overlapping
    20-year period in inaug20, with a blank line between periods."""
    # Iterate the periods directly instead of keeping a manual counter m
    # that merely shadowed the range index.
    for period in inaug20:
        # most_common(20) returns (word, count) pairs sorted by descending
        # count; the original relied on FreqDist.items() being sorted,
        # which NLTK 3 no longer guarantees.
        print(word_fdist(period).most_common(20))
        print("\n")
#Section 2, Task 3: Frequency distribution of sentence lengths,
#excluding stopwords and punctuation. Add script for set_length_fdist here:
def sent_length_fdist(inaug_list):
    """Return a FreqDist of sentence lengths over the given addresses.

    A sentence's length is the number of its tokens that are not
    punctuation, '--', or an English stopword (case-insensitive), per the
    task statement. The original draft built filtered lists it never used,
    printed lengths for the *entire* corpus regardless of inaug_list, and
    returned None because its return statement was commented out.

    inaug_list -- iterable of inaugural-corpus fileids.
    """
    stop_words = set(stopwords.words('english'))
    skip_tokens = set(string.punctuation) | {"--"}
    lengths = []
    for fileid in inaug_list:
        for sent in inaugural.sents(fileid):
            lengths.append(len([w for w in sent
                                if w not in skip_tokens
                                and w.lower() not in stop_words]))
    return FreqDist(lengths)
- #def sent_length_fdist(inaug_list):
- # dashdash = ["--"]
- # for i in range(len(inaugural.sents())):
- # sents_no_punc = [w for s in inaugural.sents() for w in s
- # if w not in string.punctuation and w not in dashdash]
- # print (len(inaugural.sents()))
- #Section 2, Task 3: Also compute and print the average sentence lengths in each of
- #the overlapping 20-year periods. Add script for print_average_lengths here:
- #Section 2, Task 4: Conditional freq distribution of words following 'I'/'my' or
- #preceding 'me', plus printing samples that occur >1 for each pro-period pair.
- #Add your script for build_cond_fdist here:
- #Section 2, Task 4: Also compute and print, for each pronoun and each 20-year
- #period, the list of words accompanying the pronoun more than once in the
- #addresses within the period. Add your script for print_Imyme_words here:
- #End of file
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement