Advertisement
Guest User

Untitled

a guest
Jun 24th, 2017
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.55 KB | None | 0 0
  1. from nltk.corpus import inaugural, stopwords
  2. from nltk import FreqDist, ConditionalFreqDist
  3. import re
  4.  
  5. #Section 2, Task 1: Produce a list of overlapping lists of inaugural addresses
  6. #Add script for inaug20 here:
  7.  
  8. inaug20 = []
  9. m=(-1)
  10. n=4
  11. for i in range(len(inaugural.fileids())-4):
  12. inaugural.fileids()[m:n]
  13. n+=1
  14. m+=1
  15. inaug20.append(inaugural.fileids()[m:n])
  16.  
  17. #Section 2, Task 2: Frequency distribution of words, excluding fn words and punct
  18. #Add script for word_fdist here:
  19.  
  20.  
  21. # notwords = ['!','"','%','(',')',',',':',';','.','?']
  22. # stopword = stopwords.words('english')
  23.  
  24. import string
  25.  
  26. def word_fdist(inaug_list):
  27. freq = []
  28. dashdash = ["--"]
  29. eng_stopwords = stopwords.words('english')
  30. for s in inaug_list:
  31. words_no_punc = [w for w in inaugural.words(s)
  32. if w not in string.punctuation and w not in dashdash and w.lower() not in eng_stopwords]
  33. freq += (words_no_punc)
  34. return FreqDist(freq)
  35.  
  36. #Section 2, Task 2: Also compute and print the 20 most common words in each
  37. #of the overlapping 20-year periods. Add script for print_most_common here:
  38.  
  39. def print_most_common():
  40. m = 0
  41. for i in range(len(inaugural.fileids())-4):
  42. print FreqDist.items(word_fdist(inaug20[m]))[:20]
  43. print "\n"
  44. m += 1
  45.  
  46.  
  47.  
  48. #Section 2, Task 3: Frequency distribution of sentence lengths,
  49. #excluding stopwords and punctuation. Add script for sent_length_fdist here:
  50.  
  51. def sent_length_fdist(inaug_list):
  52. freq = []
  53. dashdash = ["--"]
  54. for s in inaug_list:
  55. sents_no_punc = [w for s in inaugural.words(s) for w in s
  56. if w not in string.punctuation and w not in dashdash]
  57. for q in range(len(inaugural.sents())):
  58. print len(inaugural.sents()[q])
  59. #return FreqDist(freq)
  60.  
  61. #def sent_length_fdist(inaug_list):
  62. # dashdash = ["--"]
  63. # for i in range(len(inaugural.sents())):
  64. # sents_no_punc = [w for s in inaugural.sents() for w in s
  65. # if w not in string.punctuation and w not in dashdash]
  66. # print (len(inaugural.sents()))
  67.  
  68.  
  69. #Section 2, Task 3: Also compute and print the average sentence lengths in each of
  70. #the overlapping 20-year periods. Add script for print_average_lengths here:
  71.  
  72.  
  73. #Section 2, Task 4: Conditional freq distribution of words following 'I'/'my' or
  74. #preceding 'me', plus printing samples that occur >1 for each pro-period pair.
  75. #Add your script for build_cond_fdist here:
  76.  
  77.  
  78. #Section 2, Task 4: Also compute and print, for each pronoun and each 20-year
  79. #period, the list of words accompanying the pronoun more than once in the
  80. #addresses within the period. Add your script for print_Imyme_words here:
  81.  
  82.  
  83. #End of file
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement