# Difference between Paste ID uyFKjQLG (/uyFKjQLG) and Paste ID R9s9XPdy (/R9s9XPdy)

import click
1		import click
2
3		from sklearn.cluster import KMeans
4		from sklearn import metrics
5
6		from scipy.linalg import svd
7
8		import numpy as np
9		import pandas as pd
10
11
@click.command()
# click arguments do not accept ``help=``; the argument is documented in the
# command docstring instead (click shows the docstring as the command help).
@click.argument('input_file')
@click.option('--output-file', default=None, help='Output results to a file instead of printing results')
@click.option('--n-clusters', default=3, type=int, help='The number of KMeans clusters')
# The flag is bound to ``use_pca`` so that it does not shadow the module-level
# ``pca`` function that the command needs to call. The CLI flag is unchanged.
@click.option('--pca', 'use_pca', is_flag=True, help='Should we use PCA precprocessing of the data?')
@click.option('--pca-min-variance', default=1.0, type=float,
              help='The amount of variance to perserve if using PCA. Ignored if not used with --pca flag')
@click.option('--evaluate-clusters', is_flag=True, help='Should we evaluate cluster quality after clustering?')
@click.option('--verbose', is_flag=True, help='Print progress')
def run_clustering(input_file, output_file, n_clusters, use_pca, pca_min_variance, evaluate_clusters, verbose):
    """Run KMeans Clustering on an input TSV File with Optional PCA

    INPUT_FILE is the name of the input file. Should be stored as a Tab
    Separated File.
    """
    input_data = pd.read_table(input_file)

    X = np.array(input_data)
    if use_pca:
        if verbose:
            print('Fitting PCA')

        # ``pca`` here is the module-level function, not the CLI flag.
        X = pca(X, pca_min_variance, verbose)

    if verbose:
        print('Fitting K-Means with %s clusters' % n_clusters)

    kmeans = KMeans(n_clusters=n_clusters).fit(X)

    clusters = kmeans.labels_

    if evaluate_clusters:
        if verbose:
            print('Calculating Calinski-Harabaz Index')

        # Newer scikit-learn releases spell the metric ``calinski_harabasz_score``;
        # fall back to the historical name for older installations.
        score_fn = getattr(metrics, 'calinski_harabasz_score', None)
        if score_fn is None:
            score_fn = metrics.calinski_harabaz_score

        print('Calinski-Harabaz Index: %s' % score_fn(X, clusters))

    if output_file is not None:
        input_data['clusters'] = clusters
        # ``columns`` must be a sequence of labels; a bare string would be
        # split into single characters and fail the column lookup.
        input_data.to_csv(output_file, index=False, columns=['clusters'], sep='\t')
    else:
        print('clusters')
        for label in clusters:
            print(label)
53
54
def pca(X, variance_explained=1., verbose=True):
    """Compute PCA on data matrix X and project X onto the kept components.

    The columns of X are mean-centred, the covariance matrix is decomposed
    with SVD, and the smallest number of leading components whose cumulative
    explained-variance ratio reaches ``variance_explained`` is kept. At least
    one and at most all components are kept.

    Args:
        X: 2-D array-like; rows are treated as samples, columns as features.
        variance_explained: Fraction of the total variance to preserve.
        verbose: Print progress messages when true.

    Returns:
        The centred data projected onto the selected components, one row per
        input row and one column per kept component.
    """
    X = np.array(X)
    m, n = X.shape
    mu = X.mean(0)

    Xn = X - mu

    if verbose:
        print('Fitting covariance matrix')

    XtX = Xn.T.dot(Xn) / m

    if verbose:
        print('Computing SVD')

    U, s, _ = svd(XtX)

    variance_explained_ratio = (s / s.sum()).cumsum()

    # The number of cumulative ratios strictly below the target is the index
    # of the first component that reaches it, so one more component than that
    # count is required. Capping at ``n`` covers the case where rounding leaves
    # the final cumulative ratio marginally below the target.
    n_below = int((variance_explained_ratio < variance_explained).sum())
    n_comp = min(n_below + 1, n)

    if verbose:
        print('Selecting %s components with an explained variance of %s'
              % (n_comp, variance_explained_ratio[n_comp - 1]))

    return Xn.dot(U[:, :n_comp])
80
81
# Invoke the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    run_clustering()