Advertisement
Guest User

Untitled

a guest
Jul 16th, 2019
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.93 KB | None | 0 0
import operator
from collections import Counter

import nltk
  4. # Reading text
  5. textfile = open('testimony.txt', 'r')
  6. all_text = textfile.read()
  7. page_1 = text.split('\n\n\n')[1] # Selecting the first page
  8.  
  9. # Stopwords List
  10. stopwords = nltk.corpus.stopwords.words('french')
  11. stopwords.extend(['', '-', ':', 'Il', 'Nous', 'a', 'donc', 'comme', 'cette',
  12. 'ils', 'les', 'plus', "j'ai", 'donc'
  13. 'En', '+', "c'est", 'après', 'Le', '|', 'vers',
  14. "qu'il", 'tous', 'tout', 'dont', 'peu', 'En', "C'était"])
  15.  
  16. # Words Frequencies on All Text
  17. word_list = all_text.split(' ')
  18.  
  19. word_frequencies = {}
  20. for w in word_list:
  21. if w not in stopwords:
  22. word_frequencies[w] = word_list.count(w)
  23.  
  24. most_frequent_words = dict(sorted(word_frequencies.items(),
  25. key=operator.itemgetter(1),
  26. reverse=True)[:15]
  27. )
  28. print(most_frequent_words.keys())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement