libdo

Untitled

Nov 6th, 2017
14,146
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import click
  2.  
  3. from sklearn.cluster import KMeans
  4. from sklearn import metrics
  5.  
  6. from scipy.linalg import svd
  7.  
  8. import numpy as np
  9. import pandas as pd
  10.  
  11.  
  12. @click.command()
  13. @click.argument('input-file', help='Name of the input file. Should be stored as a Tab Separated File')
  14. @click.option('--output-file', default=None, help='Output results to a file instead of printing results')
  15. @click.option('--n-clusters', default=3, type=int, help='The number of KMeans clusters')
  16. @click.option('--pca', is_flag=True, help='Should we use PCA precprocessing of the data?')
  17. @click.option('--pca-min-variance', default=1.0, type=float,
  18.               help='The amount of variance to perserve if using PCA. Ignored if not used with --pca flag')
  19. @click.option('--evaluate-clusters', is_flag=True, help='Should we evaluate cluster quality after clustering?')
  20. @click.option('--verbose', is_flag=True, help='Print progress')
  21. def run_clustering(input_file, output_file, n_clusters, pca, pca_min_variance, evaluate_clusters, verbose):
  22.     """Run KMeans Clustering on an input TSV File with Optional PCA"""
  23.     input_data = pd.read_table(input_file)
  24.  
  25.     X = np.array(input_data)
  26.     if pca:
  27.         if verbose:
  28.             print 'Fitting PCA'
  29.  
  30.         X = pca(X, pca_min_variance, verbose)
  31.  
  32.     if verbose:
  33.         print 'Fitting K-Means with %s clusters' % n_clusters
  34.  
  35.     kmeans = KMeans(n_clusters=n_clusters).fit(X)
  36.  
  37.     clusters = kmeans.labels_
  38.  
  39.     if evaluate_clusters:
  40.         if verbose:
  41.             print 'Calculating Calinski-Harabaz Index'
  42.  
  43.         print 'Calinski-Harabaz Index: %s' % metrics.calinski_harabaz_score(X, clusters)
  44.  
  45.     if output_file is not None:
  46.         input_data['clusters'] = clusters
  47.         input_data.to_csv(output_file, index=False, columns='clusters', sep='\t')
  48.  
  49.     else:
  50.         print 'clusters'
  51.         for i in clusters:
  52.             print i
  53.  
  54.  
  55. def pca(X, variance_explained=1., verbose=True):
  56.     """Compute PCA on data matrix X"""
  57.     X = np.array(X)
  58.     m, n = X.shape
  59.     mu = X.mean(0)
  60.  
  61.     Xn = X - mu
  62.  
  63.     if verbose:
  64.         print 'Fitting covariance matrix'
  65.  
  66.     XtX = Xn.T.dot(Xn) / m
  67.  
  68.     if verbose:
  69.         print 'Computing SVD'
  70.  
  71.     U, s, V = svd(XtX)
  72.  
  73.     variance_explained_ratio = (s / s.sum()).cumsum()
  74.     n_comp = (variance_explained_ratio < variance_explained).sum()
  75.  
  76.     if verbose:
  77.         print 'Selecting %s components with an explained variance of %s' % (n_comp, variance_explained_ratio[n_comp])
  78.  
  79.     return Xn.dot(U[:, :n_comp])
  80.  
  81.  
  82. if __name__ == '__main__':
  83.     run_clustering()
Add Comment
Please, Sign In to add comment