Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Not tested pseudocode
- import subprocess
- import h5py
- def call_and_check_errors(command):
- proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
- shell=True, executable='/bin/bash')
- (stdout, stderr) = proc.communicate()
- print("Check stdout: {}".format(stdout))
- if stderr:
- print("Stderr is not empty. Might be an error in call_and_check_errors for the command: {}".format(command))
- print("Check stderr: {}".format(stderr))
- return stderr # Error, very bad!
- else:
- return 0 # No error, great!
- def hdf2cool(infile, outfile, chrms_sizes, assembly='dm3', correct=True):
- """
- This function converts hiclib .hdf5 to .cool file.
- Note that attributes "heatmap" (whole-genome heatmap), "resolution" and "genomeIdxToLabel" are required in .hdf5 file.
- :param infile: input .hdf5 file
- :param outfile: output .cool file
- :param chrms_sizes: tab-separated file with chromosome lengths
- :param assembly: genome assembly (dm3 by default)
- :param correct: iteratively correct the heatmap? (True by default)
- :return: Python cooler object which was written to the file.
- """
- a = h5py.File(infile, 'r')
- heatmap = a['heatmap'].value
- chrms = list( map( lambda x: 'chr'+x if 'chr' not in x else x, pickle.loads(a['genomeIdxToLabel'].value).values() ))
- chromsizes = pd.read_csv(chrms_sizes, sep='\t', names=['name', 'length']).set_index('name').loc[chrms, 'length']
- binsize = pickle.loads(a['resolution'].value)
- bins = cooler.binnify(chromsizes, binsize)
- iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
- cooler.io.create(outfile, bins, iterator, assembly=assembly)
- c = cooler.Cooler(outfile)
- if correct:
- bias, stats = cooler.ice.iterative_correction(c, store=c, max_iters=1000)
- return c
- def hdf2cool_bynum(infile, outfile, chrms_sizes, assembly='dm3', correct=True, factor=1):
- """
- This function converts hiclib .hdf5 to .cool file.
- Note that attributes "heatmap" (whole-genome heatmap), "resolution" and "genomeIdxToLabel" are required in .hdf5 file.
- :param infile: input .hdf5 file
- :param outfile: output .cool file
- :param chrms_sizes: tab-separated file with chromosome lengths
- :param assembly: genome assembly (dm3 by default)
- :param correct: iteratively correct the heatmap? (True by default)
- :return: Python cooler object which was written to the file.
- """
- f = h5py.File(infile, 'r')
- chrms_idx = pickle.loads(f['genomeIdxToLabel'].value).keys()
- chrms_values = [pickle.loads(f['genomeIdxToLabel'].value)[x] for x in chrms_idx]
- chrms_starts = f['chromosomeStarts'].value
- mtx = np.zeros([len(f['chromosomeIndex'].value), len(f['chromosomeIndex'].value)])
- mtx.fill(np.nan)
- for k1 in chrms_idx:
- for k2 in chrms_idx:
- mtx_tmp = f['{0} {1}'.format(k1, k2)].value
- bgn1 = chrms_starts[k1]
- end1 = bgn1 + mtx_tmp.shape[0]
- bgn2 = chrms_starts[k2]
- end2 = bgn2 + mtx_tmp.shape[1]
- mtx[bgn1:end1, bgn2:end2] = mtx_tmp
- mtx[bgn1:end1, bgn2:end2] = mtx_tmp
- heatmap = mtx.copy()*factor
- chrms = list( map( lambda x: 'chr'+x if 'chr' not in x else x, pickle.loads(f['genomeIdxToLabel'].value).values() ))
- chromsizes = pd.read_csv(chrms_sizes, sep='\t', names=['name', 'length']).set_index('name').loc[chrms, 'length']
- binsize = pickle.loads(f['resolution'].value)
- bins = cooler.binnify(chromsizes, binsize)
- iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
- cooler.io.create(outfile, bins, iterator, assembly=assembly)
- c = cooler.Cooler(outfile)
- if correct:
- bias, stats = cooler.ice.iterative_correction(c, store=c)
- return c
- def create_pairix(df_input, output):
- """ #columns: readID chr1 pos1 chr2 pos2 strand1 strand2 """
- df_all = df_input.copy()
- df_all = df_all.query("(chrom1!='chrM')&(chrom2!='chrM')")
- df_all.loc[:, 'cuts1'] = df_all.apply( lambda r: r.pos1-10 if r['strand1']=='+' else r.pos1+10, axis=1 )
- df_all.loc[:, 'cuts2'] = df_all.apply( lambda r: r.pos2-10 if r['strand2']=='+' else r.pos2+10, axis=1 )
- df_all[['readID', 'chrom1', 'cuts1', 'chrom2', 'cuts2', 'strand1', 'strand2']].to_csv(output, index=False, header=False, sep='\t')
- return df_all
- def create_cooler(df, pairix_file, cool_mask, chr_sizes, resolutions_list=[20, 100]):
- create_pairix(df, pairix_file)
- for res in resolutions_list:
- command1 = "cooler csort -c1 2 -c2 4 -p1 3 -p2 5 {pairix} {chr_sizes}".format(pairix=pairix_file, chr_sizes=chr_sizes)
- command2 = "cooler cload pairix -p 4 {chr_sizes}:{res} {pairix}.blksrt.gz {output}".format(res=res*1000,
- chr_sizes=chr_sizes,
- output=cool_mask.format(res),
- pairix=pairix_file
- )
- call_and_check_errors(command1)
- call_and_check_errors(command2)
Add Comment
Please, Sign In to add comment