Advertisement
misingnoglic

script.py

Dec 11th, 2016
141
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.53 KB | None | 0 0
  1. # activiate inline plotting
  2. %pylab inline
  3. # load modules
  4. import sys
  5. import scipy as SP
  6. import pylab as PL
  7. from matplotlib import cm
  8. import h5py
  9. import os
  10.  
  11. #adjust path
  12. scLVM_BASE = './..'
  13. from scLVM import scLVM
  14.  
  15. #sys.path.append(scLVM_BASE)
  16. #sys.path.append( scLVM_BASE +'..')
  17. #sys.path.append(scLVM_BASE + 'scLVM/utils') #this is not included in the github repo
  18. #sys.path.append(scLVM_BASE +'CFG')
  19. #from misc import *
  20. #from barplot import *
  21. #from default import *
  22. from scLVM.utils.barplot import *
  23. from scLVM.utils.misc import *
  24.  
  25. from IPython.display import Latex
  26.  
  27. data = os.path.join(scLVM_BASE,'data','Tcell','data_Tcells_normCounts.h5f')
  28. f = h5py.File(data,'r')
  29. Y = f['LogNcountsMmus'][:]                 # gene expression matrix
  30. tech_noise = f['LogVar_techMmus'][:]       # technical noise
  31. genes_het_bool=f['genes_heterogen'][:]     # index of heterogeneous genes
  32. geneID = f['gene_names'][:]            # gene names
  33. cellcyclegenes_filter = SP.unique(f['cellcyclegenes_filter'][:].ravel() -1) # idx of cell cycle genes from GO
  34. cellcyclegenes_filterCB = f['ccCBall_gene_indices'][:].ravel() -1        # idx of cell cycle genes from cycle base ...
  35.  
  36.  
  37. # filter cell cycle genes
  38. idx_cell_cycle = SP.union1d(cellcyclegenes_filter,cellcyclegenes_filterCB)
  39. # determine non-zero counts
  40. idx_nonzero = SP.nonzero((Y.mean(0)**2)>0)[0]
  41. idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle,idx_nonzero)
  42. # subset gene expression matrix
  43. Ycc = Y[:,idx_cell_cycle_noise_filtered]
  44.  
  45. plt = PL.subplot(1,1,1)
  46. PL.imshow(Ycc,cmap=cm.RdBu,vmin=-3,vmax=+3,interpolation='None')
  47. #PL.colorbar()
  48. plt.set_xticks([])
  49. plt.set_yticks([])
  50. PL.xlabel('genes')
  51. PL.ylabel('cells')
  52.  
  53.  
  54. k = 80                    # number of latent factors
  55. out_dir = scLVM_BASE + 'cache'       # folder where results are cached
  56. file_name = 'Kcc.hdf5'    # name of the cache file
  57. recalc = True             # recalculate X and Kconf
  58. use_ard = True            # use automatic relevance detection
  59. sclvm = scLVM(Y)
  60. #Fit model with 80 factors
  61. X_ARD,Kcc_ARD,varGPLVM_ARD = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=k,out_dir=out_dir,file_name=file_name,recalc=recalc, use_ard=use_ard)
  62.  
  63. #Plot variance contributions from ARD
  64. plt = PL.subplot(1,1,1)
  65. PL.title('Variance explained by latent factors')
  66. PL.scatter(SP.arange(k)+1,varGPLVM_ARD['X_ARD'])
  67. PL.xlim([0,k+1])
  68. PL.xlabel('# Factor')
  69. PL.ylabel('Variance explained')
  70.  
  71. #Fit model with a single factor (rank 1 covariance matrix)
  72. X,Kcc,varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=1,out_dir='./cache',file_name=file_name,recalc=True, use_ard=False)
  73.  
  74. #Plot inferred similarity matrix
  75. plt = PL.subplot(1,1,1)
  76. PL.title('Similarity matrix based on cell cycle')
  77. PL.imshow(Kcc,cmap=cm.RdBu,vmin=-3,vmax=+3,interpolation='None')
  78. PL.colorbar()
  79. plt.set_xticks([])
  80. plt.set_yticks([])
  81. PL.xlabel('cells')
  82. PL.ylabel('cells')
  83.  
  84. # considers only heterogeneous genes
  85. Ihet = genes_het_bool==1
  86. Y    = Y[:,Ihet]
  87. tech_noise = tech_noise[Ihet]
  88. geneID = geneID[Ihet]
  89.  
  90. #optionally: restrict range for the analysis
  91. i0 = 0    # gene from which the analysis starts
  92. i1 = 2000 # gene at which the analysis ends
  93.  
  94. # construct sclvm object
  95. sclvm = scLVM(Y,geneID=geneID,tech_noise=tech_noise)
  96.  
  97. # fit the model from i0 to i1
  98. sclvm.varianceDecomposition(K=Kcc,i0=i0,i1=i1)
  99.  
  100. normalize=True    # variance components are normalizaed to sum up to one
  101.  
  102. # get variance components
  103. var, var_info = sclvm.getVarianceComponents(normalize=normalize)
  104. var_filtered = var[var_info['conv']] # filter out genes for which vd has not converged
  105.  
  106. # get corrected expression levels
  107. Ycorr = sclvm.getCorrectedExpression()
  108. Ycorr.shape
  109.  
  110. #calculate average variance components across all genes and visualize
  111. var_mean = var_filtered.mean(0)
  112. colors = ['Green','MediumBlue','Gray']
  113. pp=PL.pie(var_mean,labels=var_info['col_header'],autopct='%1.1f%%',colors=colors,
  114.        shadow=True, startangle=0)
  115.  
  116. H2=1-var_filtered[:,2]
  117. var_comp_fileds = SP.array([[0, 'cell cycle', 'Peru'],
  118.        [1, 'biol. var', 'DarkMagenta'],
  119.        [2, 'tech. var', '#92c5de']], dtype=object)
  120. var_plot(var_filtered,H2,var_comp_fileds,normalize=True, figsize=[5,4])
  121.  
  122. i0 = 0     # gene from which the analysis starts
  123. i1 = 10    # gene to which the analysis ends
  124.  
  125. # fit lmm without correction
  126. pv0,beta0,info0 = sclvm.fitLMM(K=None,i0=i0,i1=i1,verbose=False)
  127. # fit lmm with correction
  128. pv1,beta1,info1 = sclvm.fitLMM(K=Kcc,i0=i0,i1=i1,verbose=False)
  129.  
  130. plt=PL.subplot(2,2,1)
  131. PL.title('Without Correction')
  132. p=PL.imshow(beta0[:,i0:i1],cmap=cm.RdBu,vmin=-0.6,vmax=+1,interpolation='None')
  133. PL.colorbar()
  134. plt.set_xticks([])
  135. plt.set_yticks([])
  136. PL.xlabel('gene'),PL.ylabel('gene')
  137. plt=PL.subplot(2,2,2)
  138. PL.title('With Correction')
  139. p=PL.imshow(beta1[:,i0:i1],cmap=cm.RdBu,vmin=-0.6,vmax=+1,interpolation='None')
  140. PL.colorbar()
  141. plt.set_xticks([])
  142. plt.set_yticks([])
  143. PL.xlabel('gene'),PL.ylabel('gene')
  144.  
  145. SP.savetxt('Ycorr.txt',Ycorr)
  146.  
  147. import GPy
  148.  
  149. # Model optimization
  150. Ystd = Ycorr-Ycorr.mean(0)
  151. Ystd/=Ystd.std(0)
  152. input_dim = 2 # How many latent dimensions to use
  153. kern = GPy.kern.RBF(input_dim,ARD=True) # ARD kernel
  154. m = GPy.models.BayesianGPLVM(Ystd, input_dim=input_dim, kernel=kern, num_inducing=40)
  155. m.optimize('scg', messages=0, max_iters=2000)
  156.  
  157. m.kern.plot_ARD()
  158.  
  159. i_Gata3 = SP.where(geneID=='ENSMUSG00000015619')
  160. color = Ycorr[:,i_Gata3]
  161. #color = Ycorr[:,0]
  162. PL.scatter(m.X[:,0]['mean'], m.X[:,1]['mean'], 40, color)
  163. PL.xlabel('PC1')
  164. PL.ylabel('PC2')
  165. PL.colorbar()
  166.  
  167. [S,W] = PCA(Ystd,2)
  168.  
  169.  
  170. PL.scatter(S[:,0],S[:,1], 40, color)
  171. PL.xlabel('PC1')
  172. PL.ylabel('PC2')
  173. PL.colorbar()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement