Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Word-frequency script: prints the 15 most frequent tokens of
# 'testimony.txt' once French stopwords have been filtered out.
import operator
from collections import Counter

# Separator between two pages of the transcript.
PAGE_SEPARATOR = '\n\n\n'

# Tokens ignored on top of NLTK's French stopword list.  Matching is
# case-sensitive, which is why capitalised variants are listed explicitly.
EXTRA_STOPWORDS = (
    '', '-', ':', 'Il', 'Nous', 'a', 'donc', 'comme', 'cette',
    'ils', 'les', 'plus', "j'ai",
    'En', '+', "c'est", 'après', 'Le', '|', 'vers',
    "qu'il", 'tous', 'tout', 'dont', 'peu', "C'était",
)


def select_page(text, index=1):
    """Return one chunk of *text* split on PAGE_SEPARATOR.

    The default index 1 is the chunk following the first separator.
    NOTE(review): the original comment called this "the first page", so
    chunk 0 is presumably a header -- confirm against the input file.

    Raises:
        IndexError: if *text* has no chunk at *index*.
    """
    return text.split(PAGE_SEPARATOR)[index]


def load_stopwords():
    """Return NLTK's French stopwords merged with EXTRA_STOPWORDS.

    A frozenset is returned so membership tests are O(1).  The NLTK
    'stopwords' corpus must already be installed.
    """
    # Imported here so the counting helpers stay usable without NLTK.
    import nltk

    return frozenset(nltk.corpus.stopwords.words('french')).union(
        EXTRA_STOPWORDS)


def count_frequent_words(text, stopwords, limit=15):
    """Return the *limit* most frequent non-stopword tokens of *text*.

    Args:
        text: the raw text to analyse.
        stopwords: container of tokens to ignore (exact, case-sensitive).
        limit: maximum number of entries to return.

    Returns:
        A dict mapping token to count, ordered by decreasing count; ties
        keep the order in which the tokens first appear in *text*.
    """
    # split() with no argument breaks on any whitespace run, so words on
    # either side of a newline are no longer counted as a single token.
    counts = Counter(word for word in text.split() if word not in stopwords)
    # sorted() is stable, which gives the tie order documented above.
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return dict(ranked[:limit])


def main():
    """Read 'testimony.txt' and print its most frequent words."""
    # Reading text
    with open('testimony.txt', 'r') as textfile:
        all_text = textfile.read()

    # Words Frequencies on All Text (use select_page() for a single page)
    most_frequent_words = count_frequent_words(all_text, load_stopwords())
    print(most_frequent_words.keys())


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement