Advertisement
Guest User

Untitled

a guest
Sep 16th, 2019
136
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.06 KB | None | 0 0
  1. import os
  2. import sys
  3. import csv
  4. from glob import glob
  5. import argparse
  6.  
  7. from micall.core.parse_interop import read_errors, write_phix_csv
  8. from micall.core.filter_quality import report_bad_cycles
  9. from micall.core.censor_fastq import censor
  10. from micall.core.prelim_map import prelim_map
  11. from micall.core.remap import remap
  12. from micall.core.sam2aln import sam2aln
  13. from micall.core.aln2counts import aln2counts
  14.  
  15.  
  16.  
  17. def parseArgs():
  18. parser = argparse.ArgumentParser(
  19. description='Use the MiCall pipeline to process gzip-comprssed FASTQ '
  20. 'file(s) for one sample.'
  21. )
  22.  
  23. parser.add_argument('path', help='Path to directory with FASTQ files')
  24. parser.add_argument('fastq1', nargs='?', help=argparse.SUPPRESS)
  25. parser.add_argument('fastq2', nargs='?',
  26. help=argparse.SUPPRESS)
  27.  
  28. parser.add_argument('--outdir', '-d', default=None, required=False,
  29. help='<optional> Path to write output files.')
  30. parser.add_argument('--unzipped', '-u', action='store_true', required=False,
  31. help='Set if the FASTQ file is not compressed.')
  32. parser.add_argument('--interop', '-i', type=argparse.FileType('rb'),
  33. required=False,
  34. help='<optional> Path to ErrorMetricsOut.bin interop file.')
  35. parser.add_argument('--readlen', '-l', type=int, default=251,
  36. help='<optional> Read length (default: 251nt).')
  37. parser.add_argument('--index', '-x', type=int, default=8,
  38. help='<optional> Index length (default: 8nt).')
  39.  
  40. return parser.parse_args()
  41.  
  42.  
  43. def get_prefix(args):
  44. """
  45. Get filename prefix for sample
  46. :param args:
  47. :return:
  48. """
  49. fn = os.path.basename(args.fastq1.name)
  50. prefix = None
  51. if args.unzipped:
  52. if fn.endswith('.fastq'):
  53. prefix = fn.replace('.fastq', '')
  54. else:
  55. if fn.endswith('.fastq.gz'):
  56. prefix = fn.replace('.fastq.gz', '')
  57.  
  58. if prefix is None:
  59. print('Error: {} has non-standard file extension'.format(args.fastq1))
  60. sys.exit()
  61.  
  62. return prefix
  63.  
  64.  
  65. def censor_fastqs(args, prefix):
  66. """
  67. The Illumina system generates a set of binary-encoded (InterOp) files
  68. that contain useful information about the run. One of these files, called
  69. ErrorMetricsOut.bin, contains information about the tile-cycle error rates
  70. as estimated from the phiX174 control sample. We use this report to
  71. identify tile-cycle combinations with excessive error rates.
  72. :param args: return value from argparse.ArgumentParser()
  73. :param prefix: filename stem
  74. :return: a new <args> object with .fastq1 and (optionally) .fastq2
  75. replaced by read-only file objects to censored FASTQs
  76. """
  77. # parse ErrorMetricsOut.bin
  78. lengths = [args.readlen, args.index, args.index, args.readlen]
  79. records = read_errors(args.interop)
  80. quality_csv = os.path.join(args.outdir, prefix + '.quality.csv')
  81. with open(quality_csv, 'w') as handle:
  82. write_phix_csv(out_file=handle, records=records, read_lengths=lengths)
  83.  
  84. # find bad tile-cycle combinations
  85. bad_cycles_csv = os.path.join(args.outdir, prefix + '.bad_cycles.csv')
  86. with open(quality_csv, 'r') as f1, open(bad_cycles_csv, 'w') as f2:
  87. report_bad_cycles(f1, f2)
  88.  
  89. bad_cycles = csv.DictReader(open(bad_cycles_csv, 'r'))
  90. cfastq1 = os.path.relpath(args.fastq1.name.replace('.fastq', '.censor.fastq'))
  91. censor(src=args.fastq1,
  92. bad_cycles_reader=bad_cycles,
  93. dest=open(cfastq1, 'wb'),
  94. use_gzip=not args.unzipped)
  95. args.fastq1 = open(cfastq1, 'rb') # replace original file
  96.  
  97. if args.fastq2:
  98. cfastq2 = os.path.relpath(args.fastq2.name.replace('.fastq', '.censor.fastq'))
  99. censor(args.fastq2, bad_cycles, open(cfastq2, 'wb'), not args.unzipped)
  100. args.fastq2 = open(cfastq2, 'rb')
  101.  
  102. return args
  103.  
  104.  
  105.  
  106. def run_sample(args):
  107. prefix = get_prefix(args)
  108. print('MiCall-Lite running sample {}...'.format(prefix))
  109.  
  110. if args.interop:
  111. print(' Censoring bad tile-cycle combos in FASTQ')
  112. args = censor_fastqs(args, prefix)
  113.  
  114. print(' Preliminary map')
  115. prelim_csv = os.path.join(args.outdir, prefix + '.prelim.csv')
  116. with open(prelim_csv, 'w') as handle:
  117. prelim_map(fastq1=args.fastq1.name,
  118. fastq2=args.fastq2.name if args.fastq2 else None,
  119. prelim_csv=handle,
  120. gzip=not args.unzipped)
  121.  
  122. print(' Iterative remap')
  123. remap_csv = os.path.join(args.outdir, prefix + '.remap.csv')
  124. with open(remap_csv, 'w') as handle:
  125. remap(fastq1=args.fastq1.name,
  126. fastq2=args.fastq2.name if args.fastq2 else None,
  127. prelim_csv=open(prelim_csv),
  128. remap_csv=handle,
  129. gzip=not args.unzipped)
  130.  
  131. print(' Generating alignment file')
  132. align_csv = os.path.join(args.outdir, prefix + '.align.csv')
  133. with open(align_csv, 'w') as handle:
  134. sam2aln(remap_csv=open(remap_csv),
  135. aligned_csv=handle)
  136.  
  137. print(' Generating count files')
  138. nuc_csv = os.path.join(args.outdir, prefix + '.nuc.csv')
  139. amino_csv = os.path.join(args.outdir, prefix + '.amino.csv')
  140. insert_csv = os.path.join(args.outdir, prefix + '.insert.csv')
  141. conseq_csv = os.path.join(args.outdir, prefix + '.conseq.csv')
  142. with open(nuc_csv, 'w') as handle:
  143. aln2counts(aligned_csv=open(align_csv),
  144. nuc_csv=handle,
  145. amino_csv=open(amino_csv, 'w'),
  146. coord_ins_csv=open(insert_csv, 'w'),
  147. conseq_csv=open(conseq_csv, 'w'))
  148.  
  149.  
  150. # batch process
  151. args = parseArgs()
  152. if args.outdir is None:
  153. args.outdir = os.getcwd()
  154.  
  155. files = glob('{}/*R1_001.fastq.gz'.format(args.path))
  156.  
  157. print("Detected {} gzipped FASTQ files at {}".format(len(files), args.path))
  158.  
  159.  
  160. for file in files:
  161. print(file)
  162. args.fastq1 = open(file, 'rb')
  163. file2 = file.replace('_R1_', '_R2_')
  164. if not os.path.exists(file2):
  165. print("ERROR: could not find second read file {}".format(file2))
  166. sys.exit()
  167.  
  168. args.fastq2 = open(file2, 'rb')
  169. run_sample(args)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement