Advertisement
Guest User

Untitled

a guest
Jul 4th, 2015
198
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.56 KB | None | 0 0
  1. import sys
  2.  
  3. import numpy
  4. from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
  5. import nltk.corpus
  6. from nltk import decorators
  7. import nltk.stem
  8.  
  9. stemmer_func = nltk.stem.EnglishStemmer().stem
  10. stopwords = set(nltk.corpus.stopwords.words('english'))
  11.  
  12. @decorators.memoize
  13. def normalize_word(word):
  14. return stemmer_func(word.lower())
  15.  
  16. def get_words(titles):
  17. words = set()
  18. for title in job_titles:
  19. for word in title.split():
  20. words.add(normalize_word(word))
  21. return list(words)
  22.  
  23. @decorators.memoize
  24. def vectorspaced(title):
  25. title_components = [normalize_word(word) for word in title.split()]
  26. return numpy.array([
  27. word in title_components and not word in stopwords
  28. for word in words], numpy.short)
  29.  
  30. if __name__ == '__main__':
  31.  
  32. filename = 'example.txt'
  33. if len(sys.argv) == 2:
  34. filename = sys.argv[1]
  35.  
  36. with open(filename) as title_file:
  37.  
  38. job_titles = [line.strip() for line in title_file.readlines()]
  39.  
  40. words = get_words(job_titles)
  41.  
  42. # cluster = KMeansClusterer(5, euclidean_distance)
  43. cluster = GAAClusterer(5)
  44. cluster.cluster([vectorspaced(title) for title in job_titles if title])
  45.  
  46. # NOTE: This is inefficient, cluster.classify should really just be
  47. # called when you are classifying previously unseen examples!
  48. classified_examples = [
  49. cluster.classify(vectorspaced(title)) for title in job_titles
  50. ]
  51.  
  52. for cluster_id, title in sorted(zip(classified_examples, job_titles)):
  53. print cluster_id, title
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement