Untitled

# compare components from the same gmm

import numpy as np
from multiprocessing import Pool

def introspection(pop, sample=None, comp1=0, comp2=1, nbins=20, cutoff=.5, renderhists=False, usefiltered=True):
    gmm = pop['samples'][sample]['gmm']
    C = pop['samples'][sample]['C']
    prediction = gmm.predict(C)

    if usefiltered == True:
        M = pop['samples'][sample]['M_norm']
        genes = np.array(pop['filtered_genes'])
    elif usefiltered == False:
        M = pop['samples'][sample]['M']
        genes = np.array(pop['genes'])

    idx1 = np.where(prediction==comp1)[0]
    idx2 = np.where(prediction==comp2)[0]
    sub1 = M[:,idx1].toarray()
    sub2 = M[:,idx2].toarray()

    with Pool(pop['ncores']) as p:
        q = np.array(p.starmap(PA.l1norm, [(ig, sub1[ig,:], sub2[ig,:], nbins) for ig in range(sub1.shape[0])]))

    idx = np.argsort(q)
    q = q[idx]
    genes = genes[idx]
    sub1 = sub1[idx,:]
    sub2 = sub2[idx,:]

    # render l1norm values
    samplename = sample.replace('/','') # remove slash char to not mess up the folder path
    dname = 'diffexp/%s_%d_%d/' % (samplename, comp1, comp2) # define directory name
    PA.mkdir(os.path.join(pop['output'], dname)) # create directory if needed
    x = np.arange(len(q))
    y = q
    plt.scatter(x, y, s=.1, alpha=1)
    plt.axhline(y=cutoff, color='red', linewidth=.5, label='Cutoff')
    plt.axhline(y=-cutoff, color='red', linewidth=.5)
    plt.xticks([])
    plt.ylabel('l1-norm')
    plt.xlabel('Genes')
    plt.legend()
    filename = 'l1norm_values'
    filename = os.path.join(pop['output'], dname, '%s.png' % filename)
    plt.savefig(filename, dpi=200, bbox_inches='tight')
    plt.close()

    downregulated_idx = np.where(np.array(q)<-cutoff)[0] # get indices of genes with low l1-norm values
    upregulated_idx = np.where(np.array(q)>cutoff)[0] # get indices of genes with high l1-norm values
    downregulated = [genes[i] for i in downregulated_idx] # get gene labels
    upregulated = [genes[i] for i in upregulated_idx] # get gene labels
    if len(downregulated+upregulated) == 0:
        raise Exception('Cutoff value did not retrieve any gene. Please modify cutoff based on %s' % filename)

    # gsea
    currpath = '/anaconda3/lib/python3.7/site-packages/popalign/' # get current path of this file to find the genesets
    ''
    geneset = 'c5bp' # name of the geneset file
    d = PA.load_dict(os.path.join(currpath, "gsea/%s.npy" % geneset)) # load geneset dictionar
    ngenesets = 20

    dr_genesets = PA.enrichment_analysis(pop, d, downregulated, len(pop['genes']), ngenesets) # find genesets pvalues for the list of downregulated genes
    ur_genesets = PA.enrichment_analysis(pop, d, upregulated, len(pop['genes']), ngenesets) # find genesets pvalues for the list of upregulated genes

    lidx = np.concatenate([downregulated_idx,upregulated_idx])
    labels = ['downregulated']*len(downregulated_idx)+['upregulated']*len(upregulated_idx)

    with open(os.path.join(pop['output'], dname, 'downregulated_genes.txt'),'w') as fout:
        fout.write('Downregulated genes for sample %s relative to the reference sample\n\n' % sample)
        fout.write('\n'.join(downregulated)) # save list of downregulated gene names
        fout.write('\n\nMatching genesets:\n\n')
        fout.write('\n'.join(dr_genesets))
    with open(os.path.join(pop['output'], dname, 'upregulated_genes.txt'),'w') as fout:
        fout.write('Upregulated genes for sample: %s relative to the reference sample\n\n' % sample)
        fout.write('\n'.join(upregulated)) # save list of upregulated gene names
        fout.write('\n\nMatching genesets:\n\n')
        fout.write('\n'.join(ur_genesets))

    if renderhists == True: # if variable is True, then start histogram rendering
        dname = 'diffexp/%d_%s/hists/' % (refcomp, samplename) # define directory name
        try:
            shutil.rmtree(os.path.join(pop['output'], dname))
        except:
            pass
        mkdir(os.path.join(pop['output'], dname)) # create directory if needed
        for lbl,i in zip(labels, lidx): # for each gene index in final list
            gname = genes[i]

            arrref = subref[i,:]
            arrtest = subtest[i,:]
            maxref, maxtest = np.max(arrref), np.max(arrtest)
            max_ = max(maxref,maxtest)

            nbins = 20
            bref, beref = np.histogram(arrref, bins=nbins, range=(0,max_))
            btest, betest = np.histogram(arrtest, bins=nbins, range=(0,max_))
            bref = bref/len(arrref)
            btest = btest/len(arrtest)

            width = beref[-1]/nbins
            plt.bar(beref[:-1], bref, label=xref, alpha=.3, width=width)
            plt.bar(beref[:-1], btest, label=xtest, alpha=0.3, width=width)
            plt.legend()
            plt.xlabel('Normalized counts')
            plt.ylabel('Percentage of cells in subpopulation')
            plt.title('Gene %s\nSubpopulation #%d of %s' % (gname, refcomp, xref))

            filename = '%s_%s' % (lbl, gname)
            plt.savefig(os.path.join(pop['output'], dname, '%s.png' % filename), dpi=200, bbox_inches='tight')
            plt.close()

    lidx = [genes[i] for i in lidx]
    #PA.plot_heatmap(pop, refcomp, lidx, clustersamples=False, clustercells=True, savename='%d_%s_only' % (refcomp, sample), figsize=(15,15), cmap='Purples', samplelimits=False, scalegenes=True, only=sample, equalncells=True)
    return lidx

sample = 'H-DER-2'
comp1 = 2
comp2 = 3
genelist = introspection(pop, sample=sample, comp1=comp1, comp2=comp2, nbins=20, cutoff=1, renderhists=False, usefiltered=False)

importlib.reload(PA)
PA.plot_genes_gmm_cells(pop,
               sample=sample,
               genelist=genelist,
               savename='%s_%d_%d' % (sample,comp1,comp2),
               metric='correlation',
               method='single',
               clustergenes=False,
               clustercells=False,
               cmap='magma',
               figsize=(20,20)
)