Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # activiate inline plotting
- %pylab inline
- # load modules
- import sys
- import scipy as SP
- import pylab as PL
- from matplotlib import cm
- import h5py
- import os
- #adjust path
- scLVM_BASE = './..'
- from scLVM import scLVM
- #sys.path.append(scLVM_BASE)
- #sys.path.append( scLVM_BASE +'..')
- #sys.path.append(scLVM_BASE + 'scLVM/utils') #this is not included in the github repo
- #sys.path.append(scLVM_BASE +'CFG')
- #from misc import *
- #from barplot import *
- #from default import *
- from scLVM.utils.barplot import *
- from scLVM.utils.misc import *
- from IPython.display import Latex
- data = os.path.join(scLVM_BASE,'data','Tcell','data_Tcells_normCounts.h5f')
- f = h5py.File(data,'r')
- Y = f['LogNcountsMmus'][:] # gene expression matrix
- tech_noise = f['LogVar_techMmus'][:] # technical noise
- genes_het_bool=f['genes_heterogen'][:] # index of heterogeneous genes
- geneID = f['gene_names'][:] # gene names
- cellcyclegenes_filter = SP.unique(f['cellcyclegenes_filter'][:].ravel() -1) # idx of cell cycle genes from GO
- cellcyclegenes_filterCB = f['ccCBall_gene_indices'][:].ravel() -1 # idx of cell cycle genes from cycle base ...
- # filter cell cycle genes
- idx_cell_cycle = SP.union1d(cellcyclegenes_filter,cellcyclegenes_filterCB)
- # determine non-zero counts
- idx_nonzero = SP.nonzero((Y.mean(0)**2)>0)[0]
- idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle,idx_nonzero)
- # subset gene expression matrix
- Ycc = Y[:,idx_cell_cycle_noise_filtered]
- plt = PL.subplot(1,1,1)
- PL.imshow(Ycc,cmap=cm.RdBu,vmin=-3,vmax=+3,interpolation='None')
- #PL.colorbar()
- plt.set_xticks([])
- plt.set_yticks([])
- PL.xlabel('genes')
- PL.ylabel('cells')
- k = 80 # number of latent factors
- out_dir = scLVM_BASE + 'cache' # folder where results are cached
- file_name = 'Kcc.hdf5' # name of the cache file
- recalc = True # recalculate X and Kconf
- use_ard = True # use automatic relevance detection
- sclvm = scLVM(Y)
- #Fit model with 80 factors
- X_ARD,Kcc_ARD,varGPLVM_ARD = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=k,out_dir=out_dir,file_name=file_name,recalc=recalc, use_ard=use_ard)
- #Plot variance contributions from ARD
- plt = PL.subplot(1,1,1)
- PL.title('Variance explained by latent factors')
- PL.scatter(SP.arange(k)+1,varGPLVM_ARD['X_ARD'])
- PL.xlim([0,k+1])
- PL.xlabel('# Factor')
- PL.ylabel('Variance explained')
- #Fit model with a single factor (rank 1 covariance matrix)
- X,Kcc,varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=1,out_dir='./cache',file_name=file_name,recalc=True, use_ard=False)
- #Plot inferred similarity matrix
- plt = PL.subplot(1,1,1)
- PL.title('Similarity matrix based on cell cycle')
- PL.imshow(Kcc,cmap=cm.RdBu,vmin=-3,vmax=+3,interpolation='None')
- PL.colorbar()
- plt.set_xticks([])
- plt.set_yticks([])
- PL.xlabel('cells')
- PL.ylabel('cells')
- # considers only heterogeneous genes
- Ihet = genes_het_bool==1
- Y = Y[:,Ihet]
- tech_noise = tech_noise[Ihet]
- geneID = geneID[Ihet]
- #optionally: restrict range for the analysis
- i0 = 0 # gene from which the analysis starts
- i1 = 2000 # gene at which the analysis ends
- # construct sclvm object
- sclvm = scLVM(Y,geneID=geneID,tech_noise=tech_noise)
- # fit the model from i0 to i1
- sclvm.varianceDecomposition(K=Kcc,i0=i0,i1=i1)
- normalize=True # variance components are normalizaed to sum up to one
- # get variance components
- var, var_info = sclvm.getVarianceComponents(normalize=normalize)
- var_filtered = var[var_info['conv']] # filter out genes for which vd has not converged
- # get corrected expression levels
- Ycorr = sclvm.getCorrectedExpression()
- Ycorr.shape
- #calculate average variance components across all genes and visualize
- var_mean = var_filtered.mean(0)
- colors = ['Green','MediumBlue','Gray']
- pp=PL.pie(var_mean,labels=var_info['col_header'],autopct='%1.1f%%',colors=colors,
- shadow=True, startangle=0)
- H2=1-var_filtered[:,2]
- var_comp_fileds = SP.array([[0, 'cell cycle', 'Peru'],
- [1, 'biol. var', 'DarkMagenta'],
- [2, 'tech. var', '#92c5de']], dtype=object)
- var_plot(var_filtered,H2,var_comp_fileds,normalize=True, figsize=[5,4])
- i0 = 0 # gene from which the analysis starts
- i1 = 10 # gene to which the analysis ends
- # fit lmm without correction
- pv0,beta0,info0 = sclvm.fitLMM(K=None,i0=i0,i1=i1,verbose=False)
- # fit lmm with correction
- pv1,beta1,info1 = sclvm.fitLMM(K=Kcc,i0=i0,i1=i1,verbose=False)
- plt=PL.subplot(2,2,1)
- PL.title('Without Correction')
- p=PL.imshow(beta0[:,i0:i1],cmap=cm.RdBu,vmin=-0.6,vmax=+1,interpolation='None')
- PL.colorbar()
- plt.set_xticks([])
- plt.set_yticks([])
- PL.xlabel('gene'),PL.ylabel('gene')
- plt=PL.subplot(2,2,2)
- PL.title('With Correction')
- p=PL.imshow(beta1[:,i0:i1],cmap=cm.RdBu,vmin=-0.6,vmax=+1,interpolation='None')
- PL.colorbar()
- plt.set_xticks([])
- plt.set_yticks([])
- PL.xlabel('gene'),PL.ylabel('gene')
- SP.savetxt('Ycorr.txt',Ycorr)
- import GPy
- # Model optimization
- Ystd = Ycorr-Ycorr.mean(0)
- Ystd/=Ystd.std(0)
- input_dim = 2 # How many latent dimensions to use
- kern = GPy.kern.RBF(input_dim,ARD=True) # ARD kernel
- m = GPy.models.BayesianGPLVM(Ystd, input_dim=input_dim, kernel=kern, num_inducing=40)
- m.optimize('scg', messages=0, max_iters=2000)
- m.kern.plot_ARD()
- i_Gata3 = SP.where(geneID=='ENSMUSG00000015619')
- color = Ycorr[:,i_Gata3]
- #color = Ycorr[:,0]
- PL.scatter(m.X[:,0]['mean'], m.X[:,1]['mean'], 40, color)
- PL.xlabel('PC1')
- PL.ylabel('PC2')
- PL.colorbar()
- [S,W] = PCA(Ystd,2)
- PL.scatter(S[:,0],S[:,1], 40, color)
- PL.xlabel('PC1')
- PL.ylabel('PC2')
- PL.colorbar()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement