Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
# coding: utf-8
"""Basic corpus statistics with NLTK: tokenization, POS-tag counts, lemma frequencies."""

import nltk

# Fetch the models/corpora this script relies on (no-op if already cached locally).
nltk.download('punkt')                       # sentence/word tokenizer models
nltk.download('stopwords')                   # stop-word lists
nltk.download('wordnet')                     # lexical DB backing the lemmatizer
nltk.download('averaged_perceptron_tagger')  # POS-tagger model

# Read the whole corpus into memory. Encoding pinned to UTF-8 so results do not
# depend on the machine's locale (the original relied on the platform default).
with open('text.txt', 'r', encoding='utf-8') as file:
    content = file.read()
- # In[8]:
# Sentence- and word-level tokenization of the corpus.
tokenized_text = nltk.sent_tokenize(content)
tokenized_word = nltk.word_tokenize(content)

# The notebook displayed these values implicitly; as a script the bare
# expressions were no-ops, so print them explicitly instead.
print('sentences: {}'.format(len(tokenized_text)))
print('words: {}'.format(len(tokenized_word)))
# Average word tokens per sentence (raises ZeroDivisionError on an empty file,
# same as the original).
print('words per sentence: {}'.format(len(tokenized_word) / len(tokenized_text)))
- # In[32]:
from collections import Counter

# Drop English stop words (comparison is case-sensitive, so capitalized stop
# words and punctuation tokens survive), then tag with the Penn Treebank tagset.
stop_words = set(nltk.corpus.stopwords.words("english"))
filtered_text = [word for word in tokenized_word if word not in stop_words]
pos = nltk.pos_tag(filtered_text)

# Frequency of each POS tag across the filtered tokens.
counts = Counter(tag for _, tag in pos)

# nouns — NOTE(review): only singular common nouns ('NN') are counted; NNS/NNP/
# NNPS are excluded, unlike the verb/adjective/adverb groups below. Confirm
# this asymmetry is intended before changing it.
print('NN: {}'.format(counts['NN']))
# verbs: every inflection (base, past, gerund, participle, present)
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
print('VB: {}'.format(sum(counts[tag] for tag in verb_tags)))
# adjectives: base, superlative, comparative
adj_tags = ['JJ', 'JJS', 'JJR']
print('JJ: {}'.format(sum(counts[tag] for tag in adj_tags)))
# adverbs: base, superlative, comparative
adverb_tags = ['RB', 'RBS', 'RBR']
print('RB: {}'.format(sum(counts[tag] for tag in adverb_tags)))
- # In[51]:
# Top 5 most frequent nouns already in lemma form: keep a token only when it
# is tagged 'NN' and the WordNet lemmatizer leaves it unchanged (i.e. the word
# is its own lemma).
lem = nltk.stem.wordnet.WordNetLemmatizer()
lemmatized_words = [
    word for word, tag in pos
    if tag == 'NN' and word == lem.lemmatize(word)
]
lem_counts = Counter(lemmatized_words)

# The notebook displayed the sorted list implicitly; as a script the result
# was discarded, so print it. most_common(5) == sort-by-count-desc, take 5.
print(lem_counts.most_common(5))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement