Advertisement
Guest User

Untitled

a guest
Apr 18th, 2015
189
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.90 KB | None | 0 0
  1. import nltk
  2. import sys
  3. import string
  4. import re
  5. text = open('test.txt').read()
  6. text = ' '.join(re.findall('[%s]+' % string.ascii_letters, text))
  7.  
  8. # Used when tokenizing words
  9. sentence_re = r'''(?x) # set flag to allow verbose regexps
  10. ([A-Z])(\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
  11. | \w+(-\w+)* # words with optional internal hyphens
  12. | \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
  13. | \.\.\. # ellipsis
  14. | [][.,;"'?():-_`] # these are separate tokens
  15. '''
  16.  
  17. lemmatizer = nltk.WordNetLemmatizer()
  18. stemmer = nltk.stem.porter.PorterStemmer()
  19.  
  20. #Taken from Su Nam Kim Paper...
  21. grammar = r"""
  22. NBAR:
  23. {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
  24.  
  25. NP:
  26. {<NBAR>}
  27. {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
  28. """
  29. chunker = nltk.RegexpParser(grammar)
  30.  
  31. toks = nltk.regexp_tokenize(text, sentence_re)
  32. postoks = nltk.tag.pos_tag(toks)
  33.  
  34.  
  35. tree = chunker.parse(postoks)
  36.  
  37. from nltk.corpus import stopwords
  38. stopwords = stopwords.words('english')
  39.  
  40.  
  41. def leaves(tree):
  42. """Finds NP (nounphrase) leaf nodes of a chunk tree."""
  43. for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
  44. yield subtree.leaves()
  45.  
  46. def normalise(word):
  47. """Normalises words to lowercase and stems and lemmatizes it."""
  48. word = word.lower()
  49. #word = stemmer.stem_word(word)
  50. word = lemmatizer.lemmatize(word)
  51. return word
  52.  
  53. def acceptable_word(word):
  54. """Checks conditions for acceptable word: length, stopword."""
  55. accepted = bool(2 <= len(word) <= 40
  56. and word.lower() not in stopwords)
  57. return accepted
  58.  
  59.  
  60. def get_terms(tree):
  61. for leaf in leaves(tree):
  62. term = [ normalise(w) for w,t in leaf if acceptable_word(w) ]
  63. yield term
  64.  
  65. terms = get_terms(tree)
  66.  
  67. from collections import Counter
  68. c = Counter([word[0] for word in list(terms) if len(word)])
  69. print c.most_common(5)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement