Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## author: Piotr Grabowski, 03.07.2016 for Kaggle
- import pandas as pd
- from sklearn.cluster import KMeans
- from sklearn import metrics
- import re
- from sklearn.preprocessing import Imputer
- from numpy import random
- import seaborn as sb
- ### Set path to the data set
- dataset_path = "77_cancer_proteomes_CPTAC_itraq.csv"
- clinical_info = "clinical_data_breast_cancer.csv"
- pam50_proteins = "PAM50_proteins.csv"
- ## Load data
- data = pd.read_csv(dataset_path,header=0,index_col=0)
- clinical = pd.read_csv(clinical_info,header=0,index_col=0)## holds clinical information about each patient/sample
- pam50 = pd.read_csv(pam50_proteins,header=0)
- ## Drop unused information columns
- data.drop(['gene_symbol','gene_name'],axis=1,inplace=True)
- ## Change the protein data sample names to a format matching the clinical data set
- data.rename(columns=lambda x: "TCGA-%s" % (re.split('[_|-|.]',x)[0]) if bool(re.search("TCGA",x)) is True else x,inplace=True)
- ## Transpose data for the clustering algorithm since we want to divide patient samples, not proteins
- data = data.transpose()
- ## Drop clinical entries for samples not in our protein data set
- clinical = clinical.loc[[x for x in clinical.index.tolist() if x in data.index],:]
- ## Add clinical meta data to our protein data set, note: all numerical features for analysis start with NP_ or XP_
- merged = data.merge(clinical,left_index=True,right_index=True)
- ## Change name to make it look nicer in the code!
- processed = merged
- ## Numerical data for the algorithm, NP_xx/XP_xx are protein identifiers from RefSeq database
- processed_numerical = processed.loc[:,[x for x in processed.columns if bool(re.search("NP_|XP_",x)) == True]]
- ## Select only the PAM50 proteins - known panel of genes used for breast cancer subtype prediction
- processed_numerical_p50 = processed_numerical.ix[:,processed_numerical.columns.isin(pam50['RefSeqProteinID'])]
- ## Impute missing values (maybe another method would work better?)
- imputer = Imputer(missing_values='NaN', strategy='median', axis=1)
- imputer = imputer.fit(processed_numerical_p50)
- processed_numerical_p50 = imputer.transform(processed_numerical_p50)
- ## Check which number of clusters works best, 20 and 79 are just for fun and comparison.
- n_clusters = [2,3,4,5,6,7,8,10,20,79]
- def compare_k_means(k_list,data):
- ## Run clustering with different k and check the metrics
- for k in k_list:
- clusterer = KMeans(n_clusters=k, n_jobs=4)
- clusterer.fit(data)
- ## The higher (up to 1) the better
- print("Silhouette Coefficient for k == %s: %s" % (
- k, round(metrics.silhouette_score(data, clusterer.labels_), 4)))
- ## The higher (up to 1) the better
- print("Homogeneity score for k == %s: %s" % (
- k, round(metrics.homogeneity_score(processed['PAM50 mRNA'], clusterer.labels_),4)))
- print("------------------------")
- ## What if we use a random set of 43 proteins? Will the clustering be as good?
- ## Create a random numerical matrix with imputation:
- processed_numerical_random = processed_numerical.iloc[:,random.choice(range(processed_numerical.shape[1]),43)]
- imputer_rnd = imputer.fit(processed_numerical_random)
- processed_numerical_random = imputer_rnd.transform(processed_numerical_random)
- ## Check different numbers of clusters for the PAM50 proteins, there are 4 subtypes of cancer in this data
- ## 3 samples of healthy patients were dropped at the beginning...
- compare_k_means(n_clusters,processed_numerical_p50)
- ## seems that k==3 works good, the silhouette score is still high and the homogeneity score jumps ~2-fold
- ## this is what they report in the paper to be the best number of clusters!
- ## k == 79 has homogeneity score of 1.0, no wonder since the algorithm can assign all the points their separate clusters!
- ## However, for our application, such clustering would be worthless.
- ## Use random proteins for comparison
- compare_k_means(n_clusters,processed_numerical_random)
- ## The scores should be significantly lower than for the PAM50 proteins!
- ## Visualize data using k==3, show the heatmap of protein expression for the used PAM50 proteins (43 available in our data)
- clusterer_final = KMeans(n_clusters=3, n_jobs=4)
- clusterer_final = clusterer_final.fit(processed_numerical_p50)
- processed_p50_plot = pd.DataFrame(processed_numerical_p50)
- processed_p50_plot['KMeans_cluster'] = clusterer_final.labels_
- processed_p50_plot.sort('KMeans_cluster',axis=0,inplace=True)
- ## Look at the heatmap of protein expression in all patients and look at their assigned cluster
- ## Proteins can either be more expressed (more is produced, less degraded), not changed or lower expressed than the used reference
- ## Since each protein has a distinct function in the cell, their levels describe the functional/signaling state the cell is in.
- processed_p50_plot.index.name = 'Patient'
- sb.heatmap(processed_p50_plot) ## The x-axis are the PAM50 proteins we used and the right-most column is the cluster marker
- ## Looks like the clustering works quite decently here!
- ## Each cluster means a different molecular signature for each patient. Such patients have different treatment options available
- ## to them!
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement