Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # compare components from the same gmm
- import numpy as np
- from multiprocessing import Pool
- def introspection(pop, sample=None, comp1=0, comp2=1, nbins=20, cutoff=.5, renderhists=False, usefiltered=True):
- gmm = pop['samples'][sample]['gmm']
- C = pop['samples'][sample]['C']
- prediction = gmm.predict(C)
- if usefiltered == True:
- M = pop['samples'][sample]['M_norm']
- genes = np.array(pop['filtered_genes'])
- elif usefiltered == False:
- M = pop['samples'][sample]['M']
- genes = np.array(pop['genes'])
- idx1 = np.where(prediction==comp1)[0]
- idx2 = np.where(prediction==comp2)[0]
- sub1 = M[:,idx1].toarray()
- sub2 = M[:,idx2].toarray()
- with Pool(pop['ncores']) as p:
- q = np.array(p.starmap(PA.l1norm, [(ig, sub1[ig,:], sub2[ig,:], nbins) for ig in range(sub1.shape[0])]))
- idx = np.argsort(q)
- q = q[idx]
- genes = genes[idx]
- sub1 = sub1[idx,:]
- sub2 = sub2[idx,:]
- # render l1norm values
- samplename = sample.replace('/','') # remove slash char to not mess up the folder path
- dname = 'diffexp/%s_%d_%d/' % (samplename, comp1, comp2) # define directory name
- PA.mkdir(os.path.join(pop['output'], dname)) # create directory if needed
- x = np.arange(len(q))
- y = q
- plt.scatter(x, y, s=.1, alpha=1)
- plt.axhline(y=cutoff, color='red', linewidth=.5, label='Cutoff')
- plt.axhline(y=-cutoff, color='red', linewidth=.5)
- plt.xticks([])
- plt.ylabel('l1-norm')
- plt.xlabel('Genes')
- plt.legend()
- filename = 'l1norm_values'
- filename = os.path.join(pop['output'], dname, '%s.png' % filename)
- plt.savefig(filename, dpi=200, bbox_inches='tight')
- plt.close()
- downregulated_idx = np.where(np.array(q)<-cutoff)[0] # get indices of genes with low l1-norm values
- upregulated_idx = np.where(np.array(q)>cutoff)[0] # get indices of genes with high l1-norm values
- downregulated = [genes[i] for i in downregulated_idx] # get gene labels
- upregulated = [genes[i] for i in upregulated_idx] # get gene labels
- if len(downregulated+upregulated) == 0:
- raise Exception('Cutoff value did not retrieve any gene. Please modify cutoff based on %s' % filename)
- # gsea
- currpath = '/anaconda3/lib/python3.7/site-packages/popalign/' # get current path of this file to find the genesets
- ''
- geneset = 'c5bp' # name of the geneset file
- d = PA.load_dict(os.path.join(currpath, "gsea/%s.npy" % geneset)) # load geneset dictionar
- ngenesets = 20
- dr_genesets = PA.enrichment_analysis(pop, d, downregulated, len(pop['genes']), ngenesets) # find genesets pvalues for the list of downregulated genes
- ur_genesets = PA.enrichment_analysis(pop, d, upregulated, len(pop['genes']), ngenesets) # find genesets pvalues for the list of upregulated genes
- lidx = np.concatenate([downregulated_idx,upregulated_idx])
- labels = ['downregulated']*len(downregulated_idx)+['upregulated']*len(upregulated_idx)
- with open(os.path.join(pop['output'], dname, 'downregulated_genes.txt'),'w') as fout:
- fout.write('Downregulated genes for sample %s relative to the reference sample\n\n' % sample)
- fout.write('\n'.join(downregulated)) # save list of downregulated gene names
- fout.write('\n\nMatching genesets:\n\n')
- fout.write('\n'.join(dr_genesets))
- with open(os.path.join(pop['output'], dname, 'upregulated_genes.txt'),'w') as fout:
- fout.write('Upregulated genes for sample: %s relative to the reference sample\n\n' % sample)
- fout.write('\n'.join(upregulated)) # save list of upregulated gene names
- fout.write('\n\nMatching genesets:\n\n')
- fout.write('\n'.join(ur_genesets))
- if renderhists == True: # if variable is True, then start histogram rendering
- dname = 'diffexp/%d_%s/hists/' % (refcomp, samplename) # define directory name
- try:
- shutil.rmtree(os.path.join(pop['output'], dname))
- except:
- pass
- mkdir(os.path.join(pop['output'], dname)) # create directory if needed
- for lbl,i in zip(labels, lidx): # for each gene index in final list
- gname = genes[i]
- arrref = subref[i,:]
- arrtest = subtest[i,:]
- maxref, maxtest = np.max(arrref), np.max(arrtest)
- max_ = max(maxref,maxtest)
- nbins = 20
- bref, beref = np.histogram(arrref, bins=nbins, range=(0,max_))
- btest, betest = np.histogram(arrtest, bins=nbins, range=(0,max_))
- bref = bref/len(arrref)
- btest = btest/len(arrtest)
- width = beref[-1]/nbins
- plt.bar(beref[:-1], bref, label=xref, alpha=.3, width=width)
- plt.bar(beref[:-1], btest, label=xtest, alpha=0.3, width=width)
- plt.legend()
- plt.xlabel('Normalized counts')
- plt.ylabel('Percentage of cells in subpopulation')
- plt.title('Gene %s\nSubpopulation #%d of %s' % (gname, refcomp, xref))
- filename = '%s_%s' % (lbl, gname)
- plt.savefig(os.path.join(pop['output'], dname, '%s.png' % filename), dpi=200, bbox_inches='tight')
- plt.close()
- lidx = [genes[i] for i in lidx]
- #PA.plot_heatmap(pop, refcomp, lidx, clustersamples=False, clustercells=True, savename='%d_%s_only' % (refcomp, sample), figsize=(15,15), cmap='Purples', samplelimits=False, scalegenes=True, only=sample, equalncells=True)
- return lidx
- sample = 'H-DER-2'
- comp1 = 2
- comp2 = 3
- genelist = introspection(pop, sample=sample, comp1=comp1, comp2=comp2, nbins=20, cutoff=1, renderhists=False, usefiltered=False)
- importlib.reload(PA)
- PA.plot_genes_gmm_cells(pop,
- sample=sample,
- genelist=genelist,
- savename='%s_%d_%d' % (sample,comp1,comp2),
- metric='correlation',
- method='single',
- clustergenes=False,
- clustercells=False,
- cmap='magma',
- figsize=(20,20)
- )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement