Guest User

Untitled

a guest
Dec 12th, 2018
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.18 KB | None | 0 0
  1. # Not tested pseudocode
  2.  
  3. import subprocess
  4. import h5py
  5.  
  6. def call_and_check_errors(command):
  7.  
  8. proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
  9. shell=True, executable='/bin/bash')
  10. (stdout, stderr) = proc.communicate()
  11. print("Check stdout: {}".format(stdout))
  12. if stderr:
  13. print("Stderr is not empty. Might be an error in call_and_check_errors for the command: {}".format(command))
  14. print("Check stderr: {}".format(stderr))
  15. return stderr # Error, very bad!
  16. else:
  17. return 0 # No error, great!
  18.  
  19. def hdf2cool(infile, outfile, chrms_sizes, assembly='dm3', correct=True):
  20. """
  21. This function converts hiclib .hdf5 to .cool file.
  22. Note that attributes "heatmap" (whole-genome heatmap), "resolution" and "genomeIdxToLabel" are required in .hdf5 file.
  23.  
  24. :param infile: input .hdf5 file
  25. :param outfile: output .cool file
  26. :param chrms_sizes: tab-separated file with chromosome lengths
  27. :param assembly: genome assembly (dm3 by default)
  28. :param correct: iteratively correct the heatmap? (True by default)
  29.  
  30. :return: Python cooler object which was written to the file.
  31. """
  32.  
  33. a = h5py.File(infile, 'r')
  34. heatmap = a['heatmap'].value
  35.  
  36. chrms = list( map( lambda x: 'chr'+x if 'chr' not in x else x, pickle.loads(a['genomeIdxToLabel'].value).values() ))
  37.  
  38. chromsizes = pd.read_csv(chrms_sizes, sep='\t', names=['name', 'length']).set_index('name').loc[chrms, 'length']
  39. binsize = pickle.loads(a['resolution'].value)
  40. bins = cooler.binnify(chromsizes, binsize)
  41.  
  42. iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
  43. cooler.io.create(outfile, bins, iterator, assembly=assembly)
  44.  
  45. c = cooler.Cooler(outfile)
  46. if correct:
  47. bias, stats = cooler.ice.iterative_correction(c, store=c, max_iters=1000)
  48.  
  49. return c
  50.  
  51. def hdf2cool_bynum(infile, outfile, chrms_sizes, assembly='dm3', correct=True, factor=1):
  52. """
  53. This function converts hiclib .hdf5 to .cool file.
  54. Note that attributes "heatmap" (whole-genome heatmap), "resolution" and "genomeIdxToLabel" are required in .hdf5 file.
  55.  
  56. :param infile: input .hdf5 file
  57. :param outfile: output .cool file
  58. :param chrms_sizes: tab-separated file with chromosome lengths
  59. :param assembly: genome assembly (dm3 by default)
  60. :param correct: iteratively correct the heatmap? (True by default)
  61.  
  62. :return: Python cooler object which was written to the file.
  63. """
  64.  
  65. f = h5py.File(infile, 'r')
  66.  
  67. chrms_idx = pickle.loads(f['genomeIdxToLabel'].value).keys()
  68. chrms_values = [pickle.loads(f['genomeIdxToLabel'].value)[x] for x in chrms_idx]
  69. chrms_starts = f['chromosomeStarts'].value
  70. mtx = np.zeros([len(f['chromosomeIndex'].value), len(f['chromosomeIndex'].value)])
  71. mtx.fill(np.nan)
  72.  
  73. for k1 in chrms_idx:
  74. for k2 in chrms_idx:
  75. mtx_tmp = f['{0} {1}'.format(k1, k2)].value
  76. bgn1 = chrms_starts[k1]
  77. end1 = bgn1 + mtx_tmp.shape[0]
  78. bgn2 = chrms_starts[k2]
  79. end2 = bgn2 + mtx_tmp.shape[1]
  80.  
  81. mtx[bgn1:end1, bgn2:end2] = mtx_tmp
  82. mtx[bgn1:end1, bgn2:end2] = mtx_tmp
  83.  
  84. heatmap = mtx.copy()*factor
  85.  
  86. chrms = list( map( lambda x: 'chr'+x if 'chr' not in x else x, pickle.loads(f['genomeIdxToLabel'].value).values() ))
  87.  
  88. chromsizes = pd.read_csv(chrms_sizes, sep='\t', names=['name', 'length']).set_index('name').loc[chrms, 'length']
  89. binsize = pickle.loads(f['resolution'].value)
  90. bins = cooler.binnify(chromsizes, binsize)
  91.  
  92. iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
  93. cooler.io.create(outfile, bins, iterator, assembly=assembly)
  94.  
  95. c = cooler.Cooler(outfile)
  96. if correct:
  97. bias, stats = cooler.ice.iterative_correction(c, store=c)
  98.  
  99. return c
  100.  
  101. def create_pairix(df_input, output):
  102. """ #columns: readID chr1 pos1 chr2 pos2 strand1 strand2 """
  103. df_all = df_input.copy()
  104. df_all = df_all.query("(chrom1!='chrM')&(chrom2!='chrM')")
  105. df_all.loc[:, 'cuts1'] = df_all.apply( lambda r: r.pos1-10 if r['strand1']=='+' else r.pos1+10, axis=1 )
  106. df_all.loc[:, 'cuts2'] = df_all.apply( lambda r: r.pos2-10 if r['strand2']=='+' else r.pos2+10, axis=1 )
  107.  
  108. df_all[['readID', 'chrom1', 'cuts1', 'chrom2', 'cuts2', 'strand1', 'strand2']].to_csv(output, index=False, header=False, sep='\t')
  109. return df_all
  110.  
  111. def create_cooler(df, pairix_file, cool_mask, chr_sizes, resolutions_list=[20, 100]):
  112.  
  113. create_pairix(df, pairix_file)
  114.  
  115. for res in resolutions_list:
  116. command1 = "cooler csort -c1 2 -c2 4 -p1 3 -p2 5 {pairix} {chr_sizes}".format(pairix=pairix_file, chr_sizes=chr_sizes)
  117. command2 = "cooler cload pairix -p 4 {chr_sizes}:{res} {pairix}.blksrt.gz {output}".format(res=res*1000,
  118. chr_sizes=chr_sizes,
  119. output=cool_mask.format(res),
  120. pairix=pairix_file
  121. )
  122. call_and_check_errors(command1)
  123. call_and_check_errors(command2)
Add Comment
Please, Sign In to add comment