Advertisement
Guest User

Untitled

a guest
Mar 20th, 2019
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.18 KB | None | 0 0
  1. import numpy as np
  2. import os
  3. import pysam
  4. import argparse
  5. import errno
  6.  
  7.  
  8. def main():
  9. parser = argparse.ArgumentParser(description="Splits BAM file into two BAM files, putting p reads into one BAM and (1-p) into the other")
  10. parser.add_argument('-b', '--bam-file', dest='bam_file', required=True, help='Input BAM file')
  11. parser.add_argument('-o1', '--output1', dest='output1', required=False, help='Output BAM file #1')
  12. parser.add_argument('-o2', '--output2', dest='output2', required=False, help='Output BAM file #1')
  13. parser.add_argument('-d', '--output-dir', dest='output_dir', required=False, help='Output directory')
  14. parser.add_argument('-p', '--parameter', dest='parameter', type=float, required=True, help='Parameter p of binomial distribution')
  15. args = parser.parse_args()
  16.  
  17. if not os.path.isfile(args.bam_file):
  18. print "ERROR: BAM file does not exist!"
  19. exit(-1)
  20.  
  21. if args.parameter < 0. or args.parameter > 1.:
  22. print "ERROR: parameter must be float value in range 0.0 <= p <= 1.0"
  23. exit(-1)
  24.  
  25. if not args.output1:
  26. output1_filename = os.path.splitext(args.bam_file)[0] + "_" + str(args.parameter) + os.path.splitext(args.bam_file)[1]
  27. else:
  28. output1_filename = args.output1
  29. if not args.output2:
  30. output2_filename = os.path.splitext(args.bam_file)[0] + "_" + str(1.0 - args.parameter) + os.path.splitext(args.bam_file)[1]
  31. else:
  32. output2_filename = args.output2
  33.  
  34. # if user specified output dir
  35. if args.output_dir:
  36. # if it already exists, we join paths
  37. if os.path.isdir(args.output_dir):
  38. output1_filename = os.path.join(args.output_dir, output1_filename)
  39. output2_filename = os.path.join(args.output_dir, output2_filename)
  40. # else try and create, then join
  41. else:
  42. try:
  43. os.makedirs(args.output_dir)
  44. output1_filename = os.path.join(args.output_dir, output1_filename)
  45. output2_filename = os.path.join(args.output_dir, output2_filename)
  46. except OSError as exception:
  47. if exception.errno != errno.EEXIST:
  48. raise
  49.  
  50. # open input BAM file for reading
  51. bam_f = pysam.AlignmentFile(args.bam_file, "rb")
  52. # open output BAM files for writing
  53. output1_f = pysam.AlignmentFile(output1_filename, "wb", template=bam_f)
  54. output2_f = pysam.AlignmentFile(output2_filename, "wb", template=bam_f)
  55.  
  56. num_reads = 0
  57. num_paired_reads = 0
  58. num_unpaired_reads = 0
  59. num_mate_missing = 0
  60. num_file1 = 0
  61. num_file2 = 0
  62. bam_iter = bam_f.fetch(until_eof=True)
  63. # for each read in BAM file
  64. for read in bam_iter:
  65. # sample binomial to choose which output BAM file to write to
  66. coin_flip = np.random.binomial(1, args.parameter)
  67. # if paired, get matching pair and write to output
  68. if read.is_paired and read.is_proper_pair:
  69. # if this is the first read in the pair, get second read
  70. if read.is_read1 and not read.mate_is_unmapped:
  71. bam_iter_tmp = bam_iter
  72. # get the read's mate
  73. try:
  74. read2 = bam_f.mate(read)
  75. if coin_flip:
  76. output1_f.write(read)
  77. output1_f.write(read2)
  78. num_file1 += 2
  79. else:
  80. output2_f.write(read)
  81. output2_f.write(read2)
  82. num_file2 += 2
  83. num_paired_reads += 2
  84. num_reads += 2
  85. # unless the mate is unavailable for some reason
  86. except ValueError:
  87. if coin_flip:
  88. output1_f.write(read)
  89. num_file1 += 1
  90. else:
  91. output2_f.write(read)
  92. num_file2 += 1
  93. num_mate_missing += 1
  94. num_reads += 1
  95. continue
  96. finally:
  97. bam_iter = bam_iter_tmp
  98. # else read is not paired, just throw it into a file randomly
  99. else:
  100. if coin_flip:
  101. output1_f.write(read)
  102. num_file1 += 1
  103. else:
  104. output2_f.write(read)
  105. num_file2 += 1
  106.  
  107. num_unpaired_reads += 1
  108. num_reads += 1
  109. if num_reads % 100000 == 0:
  110. print "Number of reads processed:", num_reads
  111.  
  112. bam_f.close()
  113. output1_f.close()
  114. output2_f.close()
  115. print "Total reads processed:", num_reads
  116. print "Number paired reads:", num_paired_reads, "\t(" + str(100.*num_paired_reads / float(num_reads)) + "%)"
  117. print "Number unpaired reads:", num_unpaired_reads, "\t(" + str(100.*num_unpaired_reads / float(num_reads)) + "%)"
  118. print "Number paired reads missing their mate:", num_mate_missing, "\t(" + str(100.*num_mate_missing / float(num_reads)) + "%)"
  119. print "Number of reads in first file:", num_file1, "\t(" + str(100.*num_file1 / float(num_reads)) + "%)"
  120. print "Number of reads in second file:", num_file2, "\t(" + str(100.*num_file2 / float(num_reads)) + "%)"
  121.  
  122. if __name__ == "__main__":
  123. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement