View difference between Paste ID: uyFKjQLG and R9s9XPdy
SHOW: | | - or go back to the newest paste.
1
import click
2
3
from sklearn.cluster import KMeans
4
from sklearn import metrics
5
6
from scipy.linalg import svd
7
8
import numpy as np
9
import pandas as pd
10
11
12
@click.command()
@click.argument('input_file')
@click.option('--output-file', default=None, help='Output results to a file instead of printing results')
@click.option('--n-clusters', default=3, type=int, help='The number of KMeans clusters')
@click.option('--pca', is_flag=True, help='Should we use PCA precprocessing of the data?')
@click.option('--pca-min-variance', default=1.0, type=float,
              help='The amount of variance to perserve if using PCA. Ignored if not used with --pca flag')
@click.option('--evaluate-clusters', is_flag=True, help='Should we evaluate cluster quality after clustering?')
@click.option('--verbose', is_flag=True, help='Print progress')
def run_clustering(input_file, output_file, n_clusters, pca, pca_min_variance, evaluate_clusters, verbose):
    """Run KMeans Clustering on an input TSV File with Optional PCA

    INPUT_FILE is the name of the input file. Should be stored as a Tab
    Separated File.

    The cluster label of every input row is written as a single 'clusters'
    column, either to --output-file (tab separated) or to standard output.
    """
    # NOTE: click arguments do not accept ``help=``; the argument is described
    # in the docstring above, which click shows as the command help text.
    input_data = pd.read_table(input_file)

    X = np.array(input_data)
    if pca:
        if verbose:
            print('Fitting PCA')

        # The boolean ``pca`` flag shadows the module-level ``pca()`` function
        # inside this scope, so the function is fetched from the module
        # globals instead of being called through the (boolean) local name.
        pca_transform = globals()['pca']
        X = pca_transform(X, pca_min_variance, verbose)

    if verbose:
        print('Fitting K-Means with %s clusters' % n_clusters)

    kmeans = KMeans(n_clusters=n_clusters).fit(X)
    clusters = kmeans.labels_

    if evaluate_clusters:
        if verbose:
            print('Calculating Calinski-Harabaz Index')

        # Newer scikit-learn releases spell this function
        # ``calinski_harabasz_score``; fall back to the old name otherwise.
        score_fn = getattr(metrics, 'calinski_harabasz_score', None)
        if score_fn is None:
            score_fn = metrics.calinski_harabaz_score
        print('Calinski-Harabaz Index: %s' % score_fn(X, clusters))

    if output_file is not None:
        input_data['clusters'] = clusters
        # ``columns`` must be a sequence of labels; a bare string would be
        # interpreted character by character.
        input_data.to_csv(output_file, index=False, columns=['clusters'], sep='\t')
    else:
        print('clusters')
        for label in clusters:
            print(label)
53
54
55
def pca(X, variance_explained=1., verbose=True):
    """Compute PCA on data matrix X

    The columns of X are mean-centred, the covariance matrix is decomposed
    with an SVD, and the centred data is projected onto the smallest number
    of leading components whose cumulative explained-variance ratio reaches
    ``variance_explained``.

    Parameters
    ----------
    X : array-like, two-dimensional (rows are samples, columns are features)
    variance_explained : float
        Minimum fraction of the total variance to keep. At least one
        component is always returned.
    verbose : bool
        Print progress messages when true.

    Returns
    -------
    numpy.ndarray
        The centred data projected onto the selected components, with one
        column per selected component.
    """
    X = np.array(X)
    m, n = X.shape
    mu = X.mean(0)

    Xn = X - mu

    if verbose:
        print('Fitting covariance matrix')

    XtX = Xn.T.dot(Xn) / m

    if verbose:
        print('Computing SVD')

    U, s, _ = svd(XtX)

    total_variance = s.sum()
    if total_variance > 0:
        variance_explained_ratio = (s / total_variance).cumsum()
    else:
        # Constant data: there is no variance to explain, so avoid 0 / 0 and
        # treat every component as already explaining everything.
        variance_explained_ratio = np.ones_like(s)

    # Components whose cumulative ratio is still below the target are not
    # enough on their own; one more is needed to reach it. Cap at n because
    # rounding can leave the final cumulative ratio marginally below 1.0.
    n_below = int((variance_explained_ratio < variance_explained).sum())
    n_comp = min(n_below + 1, n)

    if verbose:
        # Index n_comp - 1 is the last component that was actually selected.
        print('Selecting %s components with an explained variance of %s'
              % (n_comp, variance_explained_ratio[n_comp - 1]))

    return Xn.dot(U[:, :n_comp])
80
81
82
# Script entry point: invoke the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_clustering()