SHARE
TWEET

Untitled

a guest Jul 16th, 2019 70 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
from collections import Counter
import operator

import nltk

  4. # Reading text
  5. textfile = open('testimony.txt', 'r')
  6. all_text = textfile.read()
  7. page_1 = text.split('\n\n\n')[1] # Selecting the first page
  8.  
  9. # Stopwords List
  10. stopwords = nltk.corpus.stopwords.words('french')
  11. stopwords.extend(['', '-', ':', 'Il', 'Nous', 'a', 'donc', 'comme', 'cette',
  12.                   'ils', 'les', 'plus', "j'ai", 'donc'
  13.                   'En', '+', "c'est", 'après', 'Le', '|', 'vers',
  14.                   "qu'il", 'tous', 'tout', 'dont', 'peu', 'En', "C'était"])
  15.                  
  16. # Words Frequencies on All Text
  17. word_list = all_text.split(' ')
  18.  
  19. word_frequencies = {}  
  20. for w in word_list:
  21.     if w not in stopwords:
  22.         word_frequencies[w] = word_list.count(w)
  23.  
  24. most_frequent_words = dict(sorted(word_frequencies.items(),
  25.                                   key=operator.itemgetter(1),
  26.                                   reverse=True)[:15]
  27.                            )
  28. print(most_frequent_words.keys())
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top