from __future__ import print_function

from glob import glob
import os.path
import re
import tarfile
import time

import numpy as np

# sklearn.externals.six is only available in older scikit-learn releases (it
# was removed in later versions); these imports match the API current when
# this snippet was written.
from sklearn.externals.six.moves import html_parser
from sklearn.externals.six.moves.urllib.request import urlretrieve
from sklearn.datasets import get_data_home

from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Load the English stop word list once so tokenize() does not rebuild it on
# every call.
cachedStopWords = stopwords.words("english")

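# Note: word_tokenize and the stop word list rely on NLTK data packages that
# may need a one-time download, e.g. nltk.download('punkt') and
# nltk.download('stopwords').
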
def tokenize(text):
    """Lowercase, drop stop words, Porter-stem, and keep alphabetic tokens of
    at least `min_length` characters."""
    min_length = 3
    stemmer = PorterStemmer()  # reuse one stemmer instead of one per token
    words = [word.lower() for word in word_tokenize(text)]
    words = [word for word in words if word not in cachedStopWords]
    tokens = [stemmer.stem(word) for word in words]
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = [token for token in tokens
                       if p.match(token) and len(token) >= min_length]
    return filtered_tokens


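# For example, tokenize("Stocks rallied sharply in early trading") drops the
# stop word "in", lowercases and Porter-stems the remaining words, and
# discards any token shorter than three characters (the exact stems depend on
# the Porter stemmer).
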
class ReutersParser(html_parser.HTMLParser):
    """Utility class to parse an SGML file and yield documents one at a time."""

    def __init__(self, encoding='latin-1'):
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def handle_starttag(self, tag, attrs):
        # Dispatch to start_<tag>() if such a handler exists, else ignore.
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_data(self, data):
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data

    def start_reuters(self, attributes):
        pass

    def end_reuters(self):
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': self.title,
                          'body': tokenize(self.body),
                          'body_raw': self.body,
                          'topics': self.topics})
        self._reset()

    def start_title(self, attributes):
        self.in_title = 1

    def end_title(self):
        self.in_title = 0

    def start_body(self, attributes):
        self.in_body = 1

    def end_body(self):
        self.in_body = 0

    def start_topics(self, attributes):
        self.in_topics = 1

    def end_topics(self):
        self.in_topics = 0

    def start_d(self, attributes):
        self.in_topic_d = 1

    def end_d(self):
        self.in_topic_d = 0
        self.topics.append(self.topic_d)
        self.topic_d = ""


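# Illustrative usage of ReutersParser on a single archive file (the file name
# here is an example; the Reuters-21578 archive ships .sgm files):
#   parser = ReutersParser()
#   with open('reut2-000.sgm', 'rb') as fd:
#       for doc in parser.parse(fd):
#           print(doc['title'], doc['topics'])
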
def _not_in_sphinx():
    # Hack to detect whether the script is being executed by the sphinx doc
    # builder, in which case the download progress output is suppressed.
    return '__builtins__' in globals()


def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (list of stemmed
    tokens), 'body_raw' (str), 'title' (str), 'topics' (list(str)) keys.
    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        # Download the dataset.
        print("downloading dataset (once and for all) into %s" % data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urlretrieve(DOWNLOAD_URL, filename=archive_path,
                    reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc

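# Illustrative usage: stream the corpus lazily (the archive is cached under
# scikit-learn's data home, see get_data_home()):
#   for doc in stream_reuters_documents():
#       print(doc['title'], doc['topics'])
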

# ------------------- Clustering the Reuters corpus -------------------

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

import pickle
from time import time

reuters_texts = []
reuters_labels = []

reuters_label_index = {}
reuters_label_id = 0


def get_label_id(label):
    """Map a Reuters topic string to a stable integer id."""
    global reuters_label_id
    if label not in reuters_label_index:
        reuters_label_index[label] = reuters_label_id
        reuters_label_id += 1
    return reuters_label_index[label]


data_path_texts = "reuters_texts"
data_path_labels = "reuters_labels"
data_path_labels_index = "reuters_labels.p"

# os.rmdir(os.path.join(get_data_home(), "reuters_data"))

reuters_data_dir = os.path.join(get_data_home(), "reuters_data")

if not os.path.exists(reuters_data_dir):
    # First run: parse the corpus once and cache the preprocessed texts, the
    # per-document topic ids and the topic index as pickles.
    os.mkdir(reuters_data_dir)

    for doc in stream_reuters_documents():
        reuters_texts.append(" ".join(doc['body']))
        reuters_labels.append([get_label_id(label) for label in doc['topics']])

    pickle.dump(reuters_texts, open(os.path.join(reuters_data_dir, data_path_texts), "wb"))
    pickle.dump(reuters_labels, open(os.path.join(reuters_data_dir, data_path_labels), "wb"))
    pickle.dump(reuters_label_index, open(os.path.join(reuters_data_dir, data_path_labels_index), "wb"))
else:
    # Subsequent runs: reuse the cached pickles.
    reuters_texts = pickle.load(open(os.path.join(reuters_data_dir, data_path_texts), "rb"))
    reuters_labels = pickle.load(open(os.path.join(reuters_data_dir, data_path_labels), "rb"))
    reuters_label_index = pickle.load(open(os.path.join(reuters_data_dir, data_path_labels_index), "rb"))

# Keep a 2000-document subset so the clustering runs stay fast.
reuters_texts = reuters_texts[:2000]
reuters_labels = reuters_labels[:2000]

# The evaluation metrics below expect exactly one ground-truth label per
# sample, while Reuters documents can carry several topics; use the first
# topic (or -1 for documents without any topic) as the reference label.
labels = [doc_topics[0] if doc_topics else -1 for doc_topics in reuters_labels]
true_k = len(reuters_label_index)


print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)
X = vectorizer.fit_transform(reuters_texts)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()


print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(300)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

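# The number of SVD components (300 here) is a tunable choice; the explained
# variance printed above indicates how much of the tf-idf signal the reduced
# representation retains.
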
print("--------------------K-Means----------------------------------------")

km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=False)

print("Clustering LSA-reduced data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

# Compare the cluster assignments against the Reuters topic labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette score is label-free; it is estimated on a 1000-sample subset.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))


print("Top terms per cluster:")
# Map the cluster centers from LSA space back to tf-idf term space so the
# highest-weighted (approximate) terms of each cluster can be listed.
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()


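# Unlike k-means, DBSCAN does not take a cluster count: the eps radius and
# min_samples (left at its default here) determine the clusters, and points
# in low-density regions are labelled -1 (noise).
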
print("--------------------DBSCAN----------------------------------------")

db = DBSCAN(eps=0.85, metric='euclidean', metric_params=None, algorithm='auto',
            leaf_size=50, p=None, n_jobs=-1)

print("Clustering LSA-reduced data with %s" % db)
t0 = time()
db.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, db.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, db.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, db.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, db.labels_))
# Note: silhouette_score raises an error if DBSCAN yields fewer than two
# distinct labels among the sampled points.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, db.labels_, sample_size=1000))


print("Top terms per cluster:")
# DBSCAN has no centroids; average the LSA vectors of each cluster's members
# and map the mean back to tf-idf term space to list indicative terms.
terms = vectorizer.get_feature_names()
for i in sorted(set(db.labels_)):
    if i == -1:
        continue  # skip the noise points
    cluster_mean = X[db.labels_ == i].mean(axis=0).reshape(1, -1)
    original_space_centroid = svd.inverse_transform(cluster_mean)
    order_centroid = original_space_centroid[0].argsort()[::-1]
    print("Cluster %d:" % i, end='')
    for ind in order_centroid[:10]:
        print(' %s' % terms[ind], end='')
    print()

print("--------------------hierarchical----------------------------------------")

agglo = AgglomerativeClustering(n_clusters=true_k, affinity='euclidean',
                                linkage='ward')

print("Clustering LSA-reduced data with %s" % agglo)
t0 = time()
agglo.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, agglo.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, agglo.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, agglo.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, agglo.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, agglo.labels_, sample_size=1000))
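
# Note: KMeans is run with n_init=1 and no fixed random_state, so the k-means
# scores above vary from run to run; the DBSCAN and agglomerative results are
# deterministic for a given feature matrix.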