Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2019
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.50 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3.  
  4. # In[34]:
  5.  
  6.  
  7. import nltk
  8.  
  9. nltk.download('punkt')
  10. nltk.download('stopwords')
  11. nltk.download('wordnet')
  12. nltk.download('averaged_perceptron_tagger')
  13.  
  14.  
  15. # In[7]:
  16.  
  17.  
  18. with open('text.txt', 'r') as file:
  19.     content = file.read()
  20.  
  21.  
  22. # In[8]:
  23.  
  24.  
  25. tokenized_text = nltk.sent_tokenize(content)
  26. len(tokenized_text)
  27.  
  28.  
  29. # In[9]:
  30.  
  31.  
  32. tokenized_word = nltk.word_tokenize(content)
  33. len(tokenized_word)
  34.  
  35.  
  36. # In[12]:
  37.  
  38.  
  39. len(tokenized_word) / len(tokenized_text)
  40.  
  41.  
  42. # In[32]:
  43.  
  44.  
  45. stop_words = set(nltk.corpus.stopwords.words("english"))
  46.  
  47. filtered_text = []
  48.  
  49. for word in tokenized_word:
  50.     if word not in stop_words:
  51.         filtered_text.append(word)
  52.  
  53. pos = nltk.pos_tag(filtered_text)
  54.  
  55. from collections import Counter
  56. counts = Counter(tag for word, tag in pos)
  57.  
  58. # nouns
  59. print('NN: {}'.format(counts['NN']))
  60. # verbs
  61. verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  62. print('VB: {}'.format(sum(counts[tag] for tag in verb_tags)))
  63. # adjectives
  64. adj_tags = ['JJ', 'JJS', 'JJR']
  65. print('JJ: {}'.format(sum(counts[tag] for tag in adj_tags)))
  66. # adverbs
  67. adverb_tags = ['RB', 'RBS', 'RBR']
  68. print('RB: {}'.format(sum(counts[tag] for tag in adverb_tags)))
  69.  
  70.  
  71. # In[51]:
  72.  
  73.  
  74. lem = nltk.stem.wordnet.WordNetLemmatizer()
  75. lemmatized_words = []
  76.  
  77. for word, tag in pos:
  78.     if tag == 'NN' and word == lem.lemmatize(word):
  79.         lemmatized_words.append(word)
  80.  
  81. lem_counts = Counter(lemmatized_words)
  82. sorted(list(lem_counts.items()), key=lambda x: x[1], reverse=True)[: 5]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement