Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_parts_of_speech(prod_desc):
    """Return the normalized distribution of universal POS tags in *prod_desc*.

    Tokenizes the description on word characters (``\\w+``), tags each token
    with NLTK's universal tagset, and maps each tag (e.g. 'NOUN', 'VERB')
    to its relative frequency; the returned fractions sum to 1.0.

    Args:
        prod_desc: Free-text product description to analyze.

    Returns:
        dict mapping POS tag (str) -> relative frequency (float).
        Empty dict when the input contains no word tokens (avoids the
        original's ZeroDivisionError on empty input).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(prod_desc)
    if not tokens:
        # No word tokens: nothing to tag, and total would be 0 below.
        return {}
    tagged = nltk.pos_tag(nltk.Text(tokens), tagset='universal')
    # Counter already tallies one count per tagged token. The original
    # additionally re-incremented every tag in a second loop, doubling
    # each raw count (normalized ratios were unchanged, but the counts
    # themselves were wrong and the loop was redundant) — removed.
    counts = Counter(tag for _word, tag in tagged)
    total = sum(counts.values())
    # Keys here are tags, not words — the original misnamed them 'word'.
    return {tag: count / total for tag, count in counts.items()}
Add Comment
Please, Sign In to add comment