Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
import string
import sys
from collections import Counter

import nltk
# Read the whole input document. The context manager closes the file handle
# as soon as the read finishes instead of leaving it to the garbage collector.
with open('test.txt') as source_file:
    text = source_file.read()

# Keep only runs of ASCII letters, re-joined with single spaces. Digits,
# punctuation and non-ASCII characters are discarded at this point.
text = ' '.join(re.findall('[%s]+' % string.ascii_letters, text))
# Used when tokenizing words.
# Every group is non-capturing on purpose: re.findall (which
# nltk.regexp_tokenize relies on in current NLTK releases) returns the
# captured group contents rather than the whole match as soon as a pattern
# contains a capturing group, which would turn each token into a tuple.
# NOTE(review): in the last character class, `:-_` is a range (':' through
# '_'), not three literal characters -- confirm that this is intended.
sentence_re = r'''(?x)      # set flag to allow verbose regexps
      [A-Z](?:\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*          # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                # ellipsis
    | [][.,;"'?():-_`]      # these are separate tokens
'''
lemmatizer = nltk.WordNetLemmatizer()
# NOTE(review): no live code in this listing uses the stemmer; it is kept in
# case something outside this listing still refers to it.
stemmer = nltk.stem.porter.PorterStemmer()

# Chunk grammar taken from the Su Nam Kim paper.
# Rules inside one stage are applied in order, and a rule cannot match tokens
# that an earlier rule of the same stage has already chunked. The longer
# "NBAR IN NBAR" pattern therefore has to come before the bare NBAR pattern,
# otherwise it can never match anything.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        {<NBAR>}
"""
chunker = nltk.RegexpParser(grammar)

# Tokenize the cleaned text, tag every token with its part of speech, then
# group the tagged tokens into NBAR / NP chunks.
toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
tree = chunker.parse(postoks)
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the corpus reader to the
# collection of English stop words. A frozenset makes each `in` test a hash
# lookup instead of a scan over a list for every candidate word.
stopwords = frozenset(stopwords.words('english'))
def leaves(tree, label='NP'):
    """Yield the leaves of every subtree of *tree* carrying *label*.

    *tree* must provide ``subtrees(filter=...)``, and each subtree must
    provide ``label()`` and ``leaves()`` (the chunk tree produced by the
    chunker in this script is used that way).

    *label* defaults to ``'NP'`` (noun phrase), so existing one-argument
    calls behave as before. One list of leaves is yielded per matching
    subtree, in the order ``subtrees`` returns them.
    """
    for subtree in tree.subtrees(filter=lambda t: t.label() == label):
        yield subtree.leaves()
def normalise(word):
    """Return *word* lower-cased and lemmatized.

    The word is lower-cased first and then passed to the module-level
    ``lemmatizer``. No stemming is applied.
    """
    return lemmatizer.lemmatize(word.lower())
def acceptable_word(word):
    """Return True when *word* passes the length and stop-word checks.

    A word is accepted when it is between 2 and 40 characters long
    (inclusive) and its lower-cased form is not in the module-level
    ``stopwords`` collection.
    """
    # Both operands are already booleans, so no bool() wrapper is needed.
    return 2 <= len(word) <= 40 and word.lower() not in stopwords
def get_terms(tree):
    """Yield one list of normalised words per noun phrase found in *tree*.

    For every group of (word, tag) leaves produced by ``leaves(tree)``, the
    words rejected by ``acceptable_word`` are dropped and the remaining ones
    are passed through ``normalise``. A phrase whose words are all rejected
    yields an empty list.
    """
    for leaf in leaves(tree):
        # The part-of-speech tag is not needed here, only the word itself.
        yield [normalise(w) for w, _tag in leaf if acceptable_word(w)]
terms = get_terms(tree)

# Count the first surviving word of every noun phrase. Phrases whose words
# were all filtered out come back as empty lists and are skipped.
c = Counter(term[0] for term in terms if term)

# The function-call form of print works on both Python 2 and Python 3; the
# original print statement is a syntax error on Python 3.
print(c.most_common(5))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement