Untitled

# Not tested pseudocode

import subprocess
import h5py

def call_and_check_errors(command):

    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            shell=True, executable='/bin/bash')
    (stdout, stderr) = proc.communicate()
    print("Check stdout: {}".format(stdout))
    if stderr:
        print("Stderr is not empty. Might be an error in call_and_check_errors for the command: {}".format(command))
        print("Check stderr: {}".format(stderr))
        return stderr   # Error, very bad!
    else:
        return 0        # No error, great!

def hdf2cool(infile, outfile, chrms_sizes, assembly='dm3', correct=True):
    """
    This function converts hiclib .hdf5 to .cool file.
    Note that attributes "heatmap" (whole-genome heatmap), "resolution" and "genomeIdxToLabel" are required in .hdf5 file.

    :param infile: input .hdf5 file
    :param outfile: output .cool file
    :param chrms_sizes: tab-separated file with chromosome lengths
    :param assembly: genome assembly (dm3 by default)
    :param correct: iteratively correct the heatmap? (True by default)

    :return: Python cooler object which was written to the file.
    """

    a = h5py.File(infile, 'r')
    heatmap = a['heatmap'].value

    chrms = list( map( lambda x: 'chr'+x if 'chr' not in x else x, pickle.loads(a['genomeIdxToLabel'].value).values() ))

    chromsizes = pd.read_csv(chrms_sizes, sep='\t', names=['name', 'length']).set_index('name').loc[chrms, 'length']
    binsize = pickle.loads(a['resolution'].value)
    bins    = cooler.binnify(chromsizes, binsize)

    iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
    cooler.io.create(outfile, bins, iterator, assembly=assembly)

    c = cooler.Cooler(outfile)
    if correct:
        bias, stats = cooler.ice.iterative_correction(c, store=c, max_iters=1000)

    return c

def hdf2cool_bynum(infile, outfile, chrms_sizes, assembly='dm3', correct=True, factor=1):
    """
    This function converts hiclib .hdf5 to .cool file.
    Note that attributes "heatmap" (whole-genome heatmap), "resolution" and "genomeIdxToLabel" are required in .hdf5 file.

    :param infile: input .hdf5 file
    :param outfile: output .cool file
    :param chrms_sizes: tab-separated file with chromosome lengths
    :param assembly: genome assembly (dm3 by default)
    :param correct: iteratively correct the heatmap? (True by default)

    :return: Python cooler object which was written to the file.
    """

    f = h5py.File(infile, 'r')

    chrms_idx = pickle.loads(f['genomeIdxToLabel'].value).keys()
    chrms_values = [pickle.loads(f['genomeIdxToLabel'].value)[x] for x in chrms_idx]
    chrms_starts = f['chromosomeStarts'].value
    mtx = np.zeros([len(f['chromosomeIndex'].value), len(f['chromosomeIndex'].value)])
    mtx.fill(np.nan)

    for k1 in chrms_idx:
        for k2 in chrms_idx:
            mtx_tmp = f['{0} {1}'.format(k1, k2)].value
            bgn1 = chrms_starts[k1]
            end1 = bgn1 + mtx_tmp.shape[0]
            bgn2 = chrms_starts[k2]
            end2 = bgn2 + mtx_tmp.shape[1]

            mtx[bgn1:end1, bgn2:end2] = mtx_tmp
            mtx[bgn1:end1, bgn2:end2] = mtx_tmp

    heatmap = mtx.copy()*factor

    chrms = list( map( lambda x: 'chr'+x if 'chr' not in x else x, pickle.loads(f['genomeIdxToLabel'].value).values() ))

    chromsizes = pd.read_csv(chrms_sizes, sep='\t', names=['name', 'length']).set_index('name').loc[chrms, 'length']
    binsize = pickle.loads(f['resolution'].value)
    bins    = cooler.binnify(chromsizes, binsize)

    iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
    cooler.io.create(outfile, bins, iterator, assembly=assembly)

    c = cooler.Cooler(outfile)
    if correct:
        bias, stats = cooler.ice.iterative_correction(c, store=c)

    return c

def create_pairix(df_input, output):
    """ #columns: readID chr1 pos1 chr2 pos2 strand1 strand2 """
    df_all = df_input.copy()
    df_all = df_all.query("(chrom1!='chrM')&(chrom2!='chrM')")
    df_all.loc[:, 'cuts1'] = df_all.apply( lambda r: r.pos1-10 if r['strand1']=='+' else r.pos1+10, axis=1 )
    df_all.loc[:, 'cuts2'] = df_all.apply( lambda r: r.pos2-10 if r['strand2']=='+' else r.pos2+10, axis=1 )

    df_all[['readID', 'chrom1', 'cuts1', 'chrom2', 'cuts2', 'strand1', 'strand2']].to_csv(output, index=False, header=False, sep='\t')
    return df_all

def create_cooler(df, pairix_file, cool_mask, chr_sizes, resolutions_list=[20, 100]):

    create_pairix(df, pairix_file)

    for res in resolutions_list:
        command1 = "cooler csort -c1 2 -c2 4 -p1 3 -p2 5 {pairix} {chr_sizes}".format(pairix=pairix_file, chr_sizes=chr_sizes)
        command2 = "cooler cload pairix -p 4 {chr_sizes}:{res} {pairix}.blksrt.gz {output}".format(res=res*1000,
                                                                                   chr_sizes=chr_sizes,
                                                                                   output=cool_mask.format(res),
                                                                                   pairix=pairix_file
                                                                                  )
        call_and_check_errors(command1)
        call_and_check_errors(command2)