Advertisement
Guest User

kaggle_breast_cancer_proteomes

a guest
Jul 3rd, 2016
5,264
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.12 KB | None | 0 0
  1. ## author: Piotr Grabowski, 03.07.2016 for Kaggle
  2. import pandas as pd
  3. from sklearn.cluster import KMeans
  4. from sklearn import metrics
  5. import re
  6. from sklearn.preprocessing import Imputer
  7. from numpy import random
  8. import seaborn as sb
  9.  
  10. ### Set path to the data set
  11. dataset_path = "77_cancer_proteomes_CPTAC_itraq.csv"
  12. clinical_info = "clinical_data_breast_cancer.csv"
  13. pam50_proteins = "PAM50_proteins.csv"
  14.  
  15. ## Load data
  16. data = pd.read_csv(dataset_path,header=0,index_col=0)
  17. clinical = pd.read_csv(clinical_info,header=0,index_col=0)## holds clinical information about each patient/sample
  18. pam50 = pd.read_csv(pam50_proteins,header=0)
  19.  
  20. ## Drop unused information columns
  21. data.drop(['gene_symbol','gene_name'],axis=1,inplace=True)
  22.  
  23.  
  24. ## Change the protein data sample names to a format matching the clinical data set
  25. data.rename(columns=lambda x: "TCGA-%s" % (re.split('[_|-|.]',x)[0]) if bool(re.search("TCGA",x)) is True else x,inplace=True)
  26.  
  27. ## Transpose data for the clustering algorithm since we want to divide patient samples, not proteins
  28. data = data.transpose()
  29.  
  30. ## Drop clinical entries for samples not in our protein data set
  31. clinical = clinical.loc[[x for x in clinical.index.tolist() if x in data.index],:]
  32.  
  33. ## Add clinical meta data to our protein data set, note: all numerical features for analysis start with NP_ or XP_
  34. merged = data.merge(clinical,left_index=True,right_index=True)
  35.  
  36. ## Change name to make it look nicer in the code!
  37. processed = merged
  38.  
  39. ## Numerical data for the algorithm, NP_xx/XP_xx are protein identifiers from RefSeq database
  40. processed_numerical = processed.loc[:,[x for x in processed.columns if bool(re.search("NP_|XP_",x)) == True]]
  41.  
  42. ## Select only the PAM50 proteins - known panel of genes used for breast cancer subtype prediction
  43. processed_numerical_p50 = processed_numerical.ix[:,processed_numerical.columns.isin(pam50['RefSeqProteinID'])]
  44.  
  45. ## Impute missing values (maybe another method would work better?)
  46. imputer = Imputer(missing_values='NaN', strategy='median', axis=1)
  47. imputer = imputer.fit(processed_numerical_p50)
  48. processed_numerical_p50 = imputer.transform(processed_numerical_p50)
  49.  
  50. ## Check which number of clusters works best, 20 and 79 are just for fun and comparison.
  51. n_clusters = [2,3,4,5,6,7,8,10,20,79]
  52.  
  53. def compare_k_means(k_list,data):
  54.     ## Run clustering with different k and check the metrics
  55.     for k in k_list:
  56.         clusterer = KMeans(n_clusters=k, n_jobs=4)
  57.         clusterer.fit(data)
  58.         ## The higher (up to 1) the better
  59.         print("Silhouette Coefficient for k == %s: %s" % (
  60.         k, round(metrics.silhouette_score(data, clusterer.labels_), 4)))
  61.         ## The higher (up to 1) the better
  62.         print("Homogeneity score for k == %s: %s" % (
  63.         k, round(metrics.homogeneity_score(processed['PAM50 mRNA'], clusterer.labels_),4)))
  64.         print("------------------------")
  65.  
  66. ## What if we use a random set of 43 proteins? Will the clustering be as good?
  67. ## Create a random numerical matrix with imputation:
  68. processed_numerical_random = processed_numerical.iloc[:,random.choice(range(processed_numerical.shape[1]),43)]
  69. imputer_rnd = imputer.fit(processed_numerical_random)
  70. processed_numerical_random = imputer_rnd.transform(processed_numerical_random)
  71.  
  72.  
  73. ## Check different numbers of clusters for the PAM50 proteins, there are 4 subtypes of cancer in this data
  74. ## 3 samples of healthy patients were dropped at the beginning...
  75. compare_k_means(n_clusters,processed_numerical_p50)
  76. ## seems that k==3 works good, the silhouette score is still high and the homogeneity score jumps ~2-fold
  77. ## this is what they report in the paper to be the best number of clusters!
  78. ## k == 79 has homogeneity score of 1.0, no wonder since the algorithm can assign all the points their separate clusters!
  79. ## However, for our application, such clustering would be worthless.
  80.  
  81. ## Use random proteins for comparison
  82. compare_k_means(n_clusters,processed_numerical_random)
  83. ## The scores should be significantly lower than for the PAM50 proteins!
  84.  
  85.  
  86. ## Visualize data using k==3, show the heatmap of protein expression for the used PAM50 proteins (43 available in our data)
  87. clusterer_final = KMeans(n_clusters=3, n_jobs=4)
  88. clusterer_final = clusterer_final.fit(processed_numerical_p50)
  89. processed_p50_plot = pd.DataFrame(processed_numerical_p50)
  90. processed_p50_plot['KMeans_cluster'] = clusterer_final.labels_
  91. processed_p50_plot.sort('KMeans_cluster',axis=0,inplace=True)
  92.  
  93. ## Look at the heatmap of protein expression in all patients and look at their assigned cluster
  94. ## Proteins can either be more expressed (more is produced, less degraded), not changed or lower expressed than the used reference
  95. ## Since each protein has a distinct function in the cell, their levels describe the functional/signaling state the cell is in.
  96. processed_p50_plot.index.name = 'Patient'
  97. sb.heatmap(processed_p50_plot) ## The x-axis are the PAM50 proteins we used and the right-most column is the cluster marker
  98. ## Looks like the clustering works quite decently here!
  99.  
  100. ## Each cluster means a different molecular signature for each patient. Such patients have different treatment options available
  101. ## to them!
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement