Guest User

Untitled

a guest
May 26th, 2018
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 28.58 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. """Need to ensure a folder exists for the particular sample containing the trimmed fastqs.
  4. This is required as VICUNA's input is a directory that needs to contain only fastqs that
  5. require assembly."""
  6.  
  7. import os
  8. import sys
  9. import subprocess
  10. import argparse
  11. import re
  12. import logging
  13. import csv
  14. from xml.dom import minidom
  15. import xml.etree.ElementTree as ET
  16.  
  17.  
  18. def main():
  19. sample = sys.argv[1]
  20. print sample
  21.  
  22. os.chdir(sample)
  23. print 'Current dir', os.getcwd()
  24.  
  25. assembly_dir = 'assembly/'
  26. os.makedirs(assembly_dir)
  27. print 'Assembly dir', assembly_dir
  28.  
  29. hq_fq1 = '%s_hq_1.fastq' % (sample)
  30. hq_fq2 = '%s_hq_2.fastq' % (sample)
  31. filt_fq1 = 'assembly/%s_filtered_1.fastq' % (sample)
  32. filt_fq2 = 'assembly/%s_filtered_2.fastq' % (sample)
  33. db = '/home/kieren/result_dir/hg38_hcv_k15_s3' # define HCV db filepath
  34. hcvfasta = '/home/kieren/UCL_IVA/Data/hcv.fasta'
  35. contigs = 'assembly/%s.contigs.fasta' % (sample)
  36. best_ref_fasta = '%s-ref.fasta' % (sample)
  37. lastz_path = '/home/kieren/lastz-distrib/bin/lastz'
  38. lastz_analysed_file = 'assembly/lastz_analysed_file'
  39.  
  40. human_filtering(sample, db, hq_fq1, hq_fq2, filt_fq1, filt_fq2)
  41. print 'Filtering', filt_fq1
  42. split_pops(sample, filt_fq1, filt_fq2)
  43. print 'Splitpops', filt_fq1
  44. denovo_assembly(sample)
  45. print 'De novo', filt_fq1
  46. find_best_ref(sample, hcvfasta)
  47. print 'Find bestref', filt_fq1
  48. assemble_draft(sample, lastz_path, best_ref_fasta)
  49. print 'Assemble draft', filt_fq1
  50. contig_map(sample, contigs, filt_fq1, filt_fq2, best_ref_fasta, lastz_analysed_file)
  51. print 'Contig map', filt_fq1
  52.  
  53.  
  54. def human_filtering(sample, db, hq_fq1, hq_fq2, filt_fq1, filt_fq2):
  55. """Requires HCV smalt database index path and trimmed fastq files"""
  56.  
  57. pairs_sam = '%s_pairs.sam' % (sample)
  58.  
  59. with open(hq_fq1) as hq1:
  60. for i, l in enumerate(hq1):
  61. pass
  62.  
  63. fw_reads = (i + 1) / 4
  64.  
  65. print 'Trimmed forward reads', fw_reads
  66.  
  67. with open(hq_fq2) as hq2:
  68. for i, l in enumerate(hq2):
  69. pass
  70.  
  71. rev_reads = (i + 1) / 4
  72.  
  73. print 'Trimmed reverse reads', rev_reads
  74.  
  75. with open(pairs_sam, 'wb') as sam:
  76. # Run smalt alignment
  77. try:
  78. p1 = subprocess.Popen(['smalt', 'map', '-x',
  79. '-y', '0.5',
  80. '-i', '500',
  81. '-n', '8',
  82. db, hq_fq1, hq_fq2],
  83. stdout=subprocess.PIPE,
  84. stderr=subprocess.PIPE)
  85.  
  86. # Filter reads using awk
  87. p2 = subprocess.Popen(['awk', """{if ($3 !~ /^chr/ && $7 !~ /^chr/) print $0}"""],
  88. # Filter reads using grep
  89. # p2 = subprocess.Popen(['grep', '-v', 'SN:chr'],
  90. # Pass smalt stdout to grep stdin
  91. stdin=p1.stdout,
  92. # Pass stdout to sam file
  93. stdout=sam,
  94. stderr=subprocess.PIPE)
  95. # Close file handle for sam file
  96. p1.stdout.close()
  97.  
  98. (stdoutdata, stderrdata) = p2.communicate()
  99. if stdoutdata:
  100. print 'smaltrc ', p1.returncode
  101. print 'greprc ', p2.returncode
  102. print "SUCCESS ", stdoutdata
  103. if stderrdata:
  104. print 'smaltrc ', p1.returncode
  105. print 'greprc ', p2.returncode
  106. print "ERROR ", stderrdata.strip()
  107.  
  108. except OSError as e:
  109. print "OSError > ", e.errno
  110. print "OSError > ", e.strerror
  111. print "OSError > ", e.filename
  112. except:
  113. print "Error > ", sys.exc_info()[0]
  114.  
  115. for flag, fq in zip(('64', '128'), (filt_fq1, filt_fq2)):
  116. with open(fq, 'wb') as openfq:
  117. bam = '%s_%s_pairs.bam' % (sample, flag)
  118. # Changed phred from forward 64 to 33 and reverse 128 to 33
  119. # Samtools view to convert SAM to BAM file
  120. try:
  121. p1 = subprocess.Popen(['samtools', 'view', '-bhf', flag, pairs_sam],
  122. stdout=subprocess.PIPE,
  123. stderr=subprocess.PIPE)
  124.  
  125. print flag
  126.  
  127. p2 = subprocess.Popen(['samtools', 'bam2fq', '-'],
  128. stdin=p1.stdout,
  129. stdout=openfq,
  130. stderr=subprocess.PIPE)
  131.  
  132. (stdoutdata, stderrdata) = p2.communicate()
  133. if stdoutdata:
  134. print 'view_fqrc ', p1.returncode
  135. print 'bamtofqrc ', p2.returncode
  136. print "SUCCESS ", stdoutdata
  137. if stderrdata:
  138. print 'view_fqrc ', p1.returncode
  139. print 'bamtofqrc ', p2.returncode
  140. print "ERROR ", stderrdata.strip()
  141.  
  142. except OSError as e:
  143. print "OSError > ", e.errno
  144. print "OSError > ", e.strerror
  145. print "OSError > ", e.filename
  146. except:
  147. print "Error > ", sys.exc_info()[0]
  148.  
  149. # Count the number of reads in the human filtered fastqs
  150.  
  151. with open(filt_fq1) as f1:
  152. for i, l in enumerate(f1):
  153. pass
  154.  
  155. filt_fw_reads = (i + 1) / 4
  156.  
  157. print 'Filtered forward reads', filt_fw_reads
  158.  
  159. with open(filt_fq2) as f2:
  160. for i, l in enumerate(f2):
  161. pass
  162.  
  163. filt_rev_reads = (i + 1) / 4
  164.  
  165. print 'Filtered reverse reads', filt_rev_reads
  166.  
  167. result = ET.Element('results')
  168. items = ET.SubElement(result, 'trimming')
  169. item1 = ET.SubElement(items, 'fw_reads')
  170. item2 = ET.SubElement(items, 'rev_reads')
  171. item3 = ET.SubElement(items, 'filt_fw_reads')
  172. item4 = ET.SubElement(items, 'filt_rev_reads')
  173. item1.set('result', 'fwd trimmed reads')
  174. item2.set('result', 'rev trimmed reads')
  175. item3.set('result', 'fwd filtered reads')
  176. item4.set('result', 'rev filtered reads')
  177. item1.text = str(fw_reads)
  178. item2.text = str(rev_reads)
  179. item3.text = str(filt_fw_reads)
  180. item4.text = str(filt_rev_reads)
  181.  
  182. xmldata = ET.tostring(result)
  183. samplexml = '%s_trim_filter.xml' % sample
  184. xmlfile = open(samplexml, 'w')
  185. xmlfile.write(xmldata)
  186.  
  187.  
  188. def split_pops(sample, filt_fq1, filt_fq2):
  189. # ###SPLIT POPULATIONS###
  190.  
  191. # Essential that this directory is completely separate as splitpops cleans up the directory and deletes all input files
  192. split_dir = 'splitpops/'
  193. os.makedirs(split_dir)
  194.  
  195. splitpops = '/home/kieren/Snork6/Snork-PHE/0.6/src/snork.py'
  196. target_ref = '/phengs/hpc_software/ucl_assembly/new_hcvrefset'
  197. filt_bam = '%s_filtered.bam' % (sample)
  198.  
  199. # #Convert filtered fastqs into bam file
  200. try:
  201. p1 = subprocess.Popen(['java', '-jar',
  202. '/phengs/hpc_software/picard-tools/1.111/FastqToSam.jar',
  203. 'F1=' + filt_fq1,
  204. 'F2=' + filt_fq2,
  205. 'O=' + filt_bam,
  206. 'SM=' + sample],
  207. stdout=subprocess.PIPE,
  208. stderr=subprocess.PIPE)
  209.  
  210. (stdoutdata, stderrdata) = p1.communicate()
  211. if stdoutdata:
  212. print "Fastq2bamrc ", p1.returncode
  213. print "SUCCESS ", stdoutdata
  214.  
  215. if stderrdata:
  216. print "Fastq2bamrc ", p1.returncode
  217. print "ERROR ", stderrdata.strip()
  218.  
  219. except OSError as e:
  220. print "OSError > ", e.errno
  221. print "OSError > ", e.strerror
  222. print "OSError > ", e.filename
  223. except:
  224. print "Error > ", sys.exc_info()[0]
  225.  
  226.  
  227. try:
  228. p2 = subprocess.Popen([splitpops, 'splitpops',
  229. '-bin', 'snorktest/src6',
  230. '-profile', 'None',
  231. '-config', '/phengs/hpc_software/ucl_assembly/snork.config.wtchg',
  232. '-orgid', 'Hepc',
  233. '-dataid', sample,
  234. '-samplename', sample,
  235. '-bampath', filt_bam,
  236. '-targetrefid', 'wtchgR00000071',
  237. '-targetrefpath', target_ref,
  238. '-outdir', split_dir,
  239. '-logdir', split_dir,
  240. '-overwrite', 'False',
  241. '-deleteints', 'True',
  242. '-verbosity', 'DEBUG'],
  243.  
  244. stdout=subprocess.PIPE,
  245. stderr=subprocess.PIPE)
  246.  
  247. (stdoutdata, stderrdata) = p2.communicate()
  248. if stdoutdata:
  249. print "splitpopsrc ", p2.returncode
  250. print "SUCCESS ", stdoutdata
  251. if stderrdata:
  252. print "splitpopsrc ", p2.returncode
  253. print "ERROR ", stderrdata.strip()
  254.  
  255. except OSError as e:
  256. print "OSError > ", e.errno
  257. print "OSError > ", e.strerror
  258. print "OSError > ", e.filename
  259. except:
  260. print "Error > ", sys.exc_info()[0]
  261.  
  262. split_stats = 'splitpops/%s_filtered.targetpop.stats.txt' % (sample)
  263.  
  264. # Create dictionary of genotypes(k) and percentage reads(v) and return the highest two
  265. d = {}
  266. with open(split_stats, 'r') as s:
  267. # Skips the header line and only returns the top 2 lines (Highest scoring genotype matches)
  268. lines = s.readlines()[1:3]
  269. main_geno = lines[0].split()
  270. main_geno_name = main_geno[0]
  271. main_geno_perc = main_geno[2]
  272. second_geno = lines[1].split()
  273. second_geno_name = second_geno[0]
  274. second_geno_perc = second_geno[2]
  275.  
  276. result = ET.Element('results')
  277. items = ET.SubElement(result, 'genotypes')
  278. item1 = ET.SubElement(items, 'genotype1')
  279. item2 = ET.SubElement(items, 'genotype2')
  280. item1.set('result', main_geno_name)
  281. item2.set('result', second_geno_name)
  282. item1.text = str(main_geno_perc)
  283. item2.text = str(second_geno_perc)
  284.  
  285. xmldata = ET.tostring(result)
  286. samplexml = '%s_splitpops.xml' % sample
  287. xmlfile = open(samplexml, 'w')
  288. xmlfile.write(xmldata)
  289.  
  290. def denovo_assembly(sample):
  291. """De novo assembly using VICUNA requires the filtered fastq files following human and non-HCV sequence removal"""
  292.  
  293. ###Starting VICUNA###
  294. # Prepare vicuna config file for specific sample
  295.  
  296. # Run VICUNA on sample specific config file
  297. vicuna_config = '/phengs/hpc_software/ucl_assembly/vicuna_config.txt'
  298. with open(vicuna_config) as vc:
  299. cwd = os.getcwd()
  300. s = vc.read().replace('*DIR*', cwd).replace('*SAMPLE*', sample)
  301.  
  302. vicuna_config_sample = 'vicuna_config_%s.txt' % (sample)
  303. with open(vicuna_config_sample, 'w') as vc:
  304. vc.write(s)
  305.  
  306. # Run VICUNA with the new sample config file
  307. p1 = subprocess.Popen(['vicuna', vicuna_config_sample],
  308. stdout=subprocess.PIPE,
  309. stderr=subprocess.PIPE)
  310.  
  311. (stdoutdata, stderrdata) = p1.communicate()
  312. print stderrdata
  313. print stdoutdata
  314. print 'vicunarc ', p1.returncode
  315.  
  316. # Following completion of VICUNA, the 'dg-0' needs to be replaced by sample name and
  317. # contig number starting at 1 as VICUNA starts at 0
  318. # and each contig needs a specific whole number for the hash later in the pipeline
  319.  
  320. contigs = 'assembly/contig.fasta'
  321. newdata = []
  322.  
  323. # Open contigs file for reading and replacing header line with new information on sample and sensible contig number
  324. with open(contigs) as c:
  325. for line in c.readlines():
  326. if line.startswith('>dg-'):
  327. start_line = re.search("^\S*", line).group(0)
  328. element = start_line.split('-')
  329.  
  330. # Returns just the number from the match object reflecting the VICUNA contig number
  331. num = element[1]
  332. print num
  333.  
  334. # Adds an increment of 1 to indicate new contig numbering
  335. count = int(num) + 1
  336. # Number of contigs
  337. count = str(count)
  338. newdata.append(re.sub(r'dg-.*', sample + '.' + count, line).strip())
  339. # print newdata
  340. else:
  341. # print line.strip()
  342. newdata.append(line.strip())
  343.  
  344.  
  345. result = ET.Element('results')
  346.  
  347. if int(count) == 0:
  348. print 'No contig'
  349. items = ET.SubElement(result, 'Contigs')
  350. item1 = ET.SubElement(items, 'number of contigs')
  351. item1.set('Contig value', 'number')
  352. item1.text = 'No contigs'
  353.  
  354. elif int(count) == 1:
  355. print 'Single contig'
  356. items = ET.SubElement(result, 'Contigs')
  357. item2 = ET.SubElement(items, 'number of contigs')
  358. item2.set('Contig value', 'number')
  359. item2.text = 'Single contig'
  360.  
  361. else:
  362. print 'Number of contigs =', count
  363. items = ET.SubElement(result, 'Contigs')
  364. item3 = ET.SubElement(items, 'number of contigs')
  365. item3.set('Contig value', 'number')
  366. item3.text = str(count)
  367.  
  368. print '\n'.join(newdata)
  369.  
  370. xmldata = ET.tostring(result)
  371. samplexml = '%s_denovo.xml' % sample
  372. xmlfile = open(samplexml, 'w')
  373. xmlfile.write(xmldata)
  374.  
  375. # Very important that this file is named correctly as lastz_analyser.WITH_REVCOMP.pl script will fail later in the pipeline.
  376. # It must be named samplename.contigs.fasta
  377. contigs_increment = 'assembly/%s.contigs.fasta' % (sample)
  378.  
  379. with open(contigs_increment, 'w') as ci:
  380. ci.write('\n'.join(newdata))
  381.  
  382.  
  383. def find_best_ref(sample, hcvfasta):
  384. """Following completion of de novo assembly, the best reference sequence from the HCV database needs to
  385. be selected for mapping to create a draft assembly"""
  386.  
  387. # Path to current available lastz executable
  388. lastz_path = '/home/kieren/lastz-distrib/bin/lastz'
  389. # Path to contigs fasta file with the specification of [multiple] required by lastz if multiple contigs
  390. contigs = 'assembly/%s.contigs.fasta[multiple]' % (sample)
  391. # Path to lastz output file
  392. contigs_lastz = 'assembly/contig.lastz'
  393.  
  394. with open(contigs_lastz, 'w') as cz:
  395. # Run LASTZ
  396. p1 = subprocess.Popen([lastz_path, contigs, hcvfasta,
  397. '--ambiguous=iupac',
  398. '--format=GENERAL'],
  399. # Pass stdout to contigs lastz file
  400. stdout=cz,
  401. stderr=subprocess.PIPE)
  402.  
  403. (stdoutdata, stderrdata) = p1.communicate()
  404. print stderrdata
  405. print stdoutdata
  406. print 'lastzrc ', p1.returncode
  407.  
  408. lastz_log = 'assembly/lastz_besthit.log'
  409. lastz_bestref = '/phengs/hpc_software/ucl_assembly/lastz_bestref.pl'
  410. best_ref_fasta = '%s-ref.fasta' % (sample)
  411.  
  412. # Run lastz_bestref.pl to find the best matching reference sequence. The perl script requires string concatenation between the flags
  413. # and arguments (+) for these to run
  414. p1 = subprocess.Popen(['perl', '-s', lastz_bestref,
  415. '-contig_lastz=' + contigs_lastz,
  416. '-blastdb=' + hcvfasta,
  417. '-best_ref_fasta=' + best_ref_fasta,
  418. '-lastz_best_hit_log=' + lastz_log],
  419. stdout=subprocess.PIPE,
  420. stderr=subprocess.PIPE)
  421.  
  422. (stdoutdata, stderrdata) = p1.communicate()
  423. print 'BEST REF STDERR', stderrdata
  424. print 'BEST REF STDOUT', stdoutdata
  425. print 'lastz_best_refrc ', p1.returncode
  426.  
  427.  
  428. def assemble_draft(sample, lastz_path, best_ref_fasta):
  429. """The best reference has been defined, now assemble the draft genome"""
  430.  
  431. contigs = 'assembly/%s.contigs.fasta' % (sample)
  432. contigs_bestref = 'assembly/contigs-vs-bestref.lav'
  433.  
  434. with open(contigs_bestref, 'w') as cb:
  435. # Run lastz to compare contigs to best reference. NOTE: NEED TO ENSURE THE FUNCTION ARGUMENTS ARE PASSED FROM PREVIOUS CORRECTLY
  436. p1 = subprocess.Popen([lastz_path, best_ref_fasta, contigs,
  437. '--ambiguous=iupac'],
  438. # Pass stdout to contigs_vs_bestref.lav file
  439. stdout=cb,
  440. stderr=subprocess.PIPE)
  441.  
  442. (stdoutdata, stderrdata) = p1.communicate()
  443. print 'Contigs vs bestref STDERR', stderrdata
  444. print 'Contigs vs bestref STDOUT', stdoutdata
  445. print 'Contigs_vs_best_refrc ', p1.returncode
  446.  
  447. lastz_analyser_rev = '/phengs/hpc_software/ucl_assembly/lastz_analyser.WITH_REVCOMP.pl'
  448. lastz_analysed_file = 'assembly/lastz_analysed_file'
  449. lastz_analysed_log = 'assembly/lastz_analyser.log'
  450.  
  451. p2 = subprocess.Popen(['perl', '-w', '-s', lastz_analyser_rev,
  452. '-reference_fasta_file=' + best_ref_fasta,
  453. '-sample_fasta_file=' + contigs,
  454. '-lastz_results_file=' + contigs_bestref,
  455. '-cutoff=50000',
  456. '-with_revcomp=yes',
  457. '-output=' + lastz_analysed_file,
  458. '-log_file=' + lastz_analysed_log],
  459. # Pass stdout to contigs lastz file
  460. stdout=subprocess.PIPE,
  461. stderr=subprocess.PIPE)
  462.  
  463. (stdoutdata, stderrdata) = p2.communicate()
  464. print 'lastz analyser STDERR', stderrdata
  465. print 'lastz analyser STDOUT', stdoutdata
  466. print 'lastz_analyserrc ', p2.returncode
  467.  
  468.  
  469. def contig_map(sample, contigs, filt_fq1, filt_fq2, best_ref_fasta, lastz_analysed_file):
  470. ## map reads using BWA MEM to contigs to work out which sequences to choose when contigs overlap
  471.  
  472. contigs_sam = 'assembly/%s.contigs.sam' % (sample)
  473. contigs_bam = 'assembly/%s.contigs.bam' % (sample)
  474. contigs_sorted = 'assembly/%s.contigs.sorted' % (sample)
  475. contigs_sorted_final = 'assembly/%s.contigs.sorted.bam' % (sample)
  476. contigs_mpileup = 'assembly/%s.contigs.mpileup' % (sample)
  477. genome_maker = '/phengs/hpc_software/ucl_assembly/genome_maker2b.pl'
  478. genome_fasta = 'assembly/%s-genome.fasta' % (sample)
  479. genome_maker_log = 'assembly/%s_genome_maker.log' % (sample)
  480. genome_sam = 'assembly/%s-genome.sam' % (sample)
  481. genome_bam = 'assembly/%s-genome.bam' % (sample)
  482. genome_sorted = 'assembly/%s-genome.sorted' % (sample)
  483. genome_sorted_final = 'assembly/%s-genome.sorted.bam' % (sample)
  484. genome_mpileup = 'assembly/%s-genome.mpileup' % (sample)
  485. cons_mv = '/phengs/hpc_software/ucl_assembly/cons_mv.pl'
  486. cons_pren = 'assembly/%s.consensus1.preNcut.fasta' % (sample)
  487. genome_mv = 'assembly/%s-genome.fasta.mv' % (sample)
  488. genome_basefreq = 'assembly/%s-genome.fasta.basefreqs.tsv' % (sample)
  489. n_remover = '/phengs/hpc_software/ucl_assembly/N_remover_from_consensus.pl'
  490. cons1_fasta = 'assembly/%s.consensus1.fasta' % (sample)
  491. cons1_sam = 'assembly/%s.consensus1.sam' % (sample)
  492. cons1_bam = 'assembly/%s.consensus1.bam' % (sample)
  493. cons1_sorted = 'assembly/%s.consensus1.sorted' % (sample)
  494. cons1_sorted_final = 'assembly/%s.consensus1.sorted.bam' % (sample)
  495. cons1_mpileup = 'assembly/%s.consensus1.mpileup' % (sample)
  496. cons2_pren = 'assembly/%s.consensus2.preNcut.fasta' % (sample)
  497. cons1_mv = 'assembly/%s.consensus1.fasta.mv' % (sample)
  498. cons1_basefreq = 'assembly/%s.consensus1.fasta.basefreqs.tsv' % (sample)
  499. cons2_fasta = 'assembly/%s.consensus2.fasta' % (sample)
  500. majvar = '/phengs/hpc_software/ucl_assembly/majvarcheck2_bwa.pl'
  501.  
  502. cmd_gm = ['perl', '-w', '-s', genome_maker,
  503. '-sample_pileup_file=' + contigs_mpileup,
  504. '-contigs=' + contigs,
  505. '-reference_mapped_consensus=' + best_ref_fasta,
  506. '-lastz_analysed_file=' + lastz_analysed_file,
  507. '-ref_correct_start=0',
  508. '-ref_correct_stop=20000',
  509. '-output=' + genome_fasta,
  510. '-logfile=' + genome_maker_log]
  511.  
  512. for fasta, sam, bam, sorted, final, mpileup in zip((contigs, genome_fasta, cons1_fasta),
  513. (contigs_sam, genome_sam, cons1_sam),
  514. (contigs_bam, genome_bam, cons1_bam),
  515. (contigs_sorted, genome_sorted, cons1_sorted),
  516. (contigs_sorted_final, genome_sorted_final, cons1_sorted_final),
  517. (contigs_mpileup, genome_mpileup, cons1_mpileup)):
  518.  
  519. p1 = subprocess.Popen(['bwa', 'index', fasta],
  520. stdout=subprocess.PIPE,
  521. stderr=subprocess.PIPE)
  522.  
  523. (stdoutdata, stderrdata) = p1.communicate()
  524. print 'bwa index STDERR', stderrdata
  525. print 'bwa index STDOUT', stdoutdata
  526. print 'bwarc ', p1.returncode
  527.  
  528. with open(sam, 'w') as s:
  529. p1 = subprocess.Popen(['bwa', 'mem',
  530. '-t', '8',
  531. fasta,
  532. filt_fq1,
  533. filt_fq2],
  534. stdout=s,
  535. stderr=subprocess.PIPE)
  536.  
  537. (stdoutdata, stderrdata) = p1.communicate()
  538. print 'bwa mem STDERR', stderrdata
  539. # print 'bwa mem STDOUT', stdoutdata
  540. print 'bwamemrc ', p1.returncode
  541.  
  542. p2 = subprocess.Popen(['samtools', 'view', '-bhS',
  543. '-o', bam, sam],
  544. stdin=p1.stdout,
  545. stdout=subprocess.PIPE,
  546. stderr=subprocess.PIPE)
  547.  
  548. (stdoutdata, stderrdata) = p2.communicate()
  549. print 'samtools view STDERR', stderrdata
  550. print 'samtools view STDOUT', stdoutdata
  551. print 'samtoolsviewrc ', p2.returncode
  552.  
  553. p3 = subprocess.Popen(['samtools', 'sort',
  554. '-@', '8', bam, sorted],
  555. stdout=subprocess.PIPE,
  556. stderr=subprocess.PIPE)
  557.  
  558. (stdoutdata, stderrdata) = p3.communicate()
  559. print 'samtools sort STDERR', stderrdata
  560. print 'samtools sort STDOUT', stdoutdata
  561. print 'samtoolssortrc ', p3.returncode
  562.  
  563. p4 = subprocess.Popen(['samtools', 'index', final],
  564. stdout=subprocess.PIPE,
  565. stderr=subprocess.PIPE)
  566.  
  567. (stdoutdata, stderrdata) = p4.communicate()
  568. print 'samtools index STDERR', stderrdata
  569. print 'samtools index STDOUT', stdoutdata
  570. print 'samtoolsindexrc ', p4.returncode
  571.  
  572. p5 = subprocess.Popen(['samtools', 'mpileup', '-f',
  573. fasta,
  574. '-d', '1000000',
  575. '-o', mpileup, final],
  576. stdout=subprocess.PIPE,
  577. stderr=subprocess.PIPE)
  578.  
  579. (stdoutdata, stderrdata) = p5.communicate()
  580. print 'samtools mpileup STDERR', stderrdata
  581. # print 'samtools mpileup STDOUT', stdoutdata
  582. print 'samtoolsmpileuprc ', p5.returncode
  583.  
  584. # Helpful method for checking exactly what is sent to the command line (subprocess.list2cmdline)
  585. if fasta == contigs:
  586. p6 = subprocess.Popen(cmd_gm, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  587.  
  588. # Print the full command line argument
  589. print 'fasta is contigs'
  590.  
  591. (stdoutdata, stderrdata) = p6.communicate()
  592. print 'gm or cons STDERR', stderrdata
  593. print 'gm or cons STDOUT', stdoutdata
  594. print 'gm or cons rc ', p6.returncode
  595.  
  596. elif fasta == genome_fasta:
  597.  
  598. p7 = subprocess.Popen(['perl', '-w', '-s', cons_mv,
  599. '-mpileup=' + genome_mpileup,
  600. '-reference_fasta=' + genome_fasta,
  601. '-mv_freq_cutoff=0.01',
  602. '-mv_variant_depth_cutoff=20',
  603. '-cons_depth_cutoff=80',
  604. '-sliding_window_size=300',
  605. '-consensus_out=' + cons_pren,
  606. '-mv_out=' + genome_mv,
  607. '-base_freq_out=' + genome_basefreq],
  608.  
  609. stdout=subprocess.PIPE,
  610. stderr=subprocess.PIPE)
  611.  
  612. print 'fasta is genome'
  613. (stdoutdata, stderrdata) = p7.communicate()
  614. print 'Consmv STDERR', stderrdata
  615. print 'Consmv STDOUT', stdoutdata
  616. print 'Consmv rc ', p7.returncode
  617.  
  618. p8 = subprocess.Popen(['perl', '-w', '-s', n_remover,
  619. '-cutoff=46',
  620. cons_pren],
  621.  
  622. stdout=open(cons1_fasta, 'w'),
  623. stderr=subprocess.PIPE)
  624.  
  625. (stdoutdata, stderrdata) = p8.communicate()
  626. print 'N_remover STDERR', stderrdata
  627. print 'N_remover STDOUT', stdoutdata
  628. print 'N_remover rc ', p8.returncode
  629.  
  630. else:
  631.  
  632. p9 = subprocess.Popen(['perl', '-w', '-s', cons_mv,
  633. '-mpileup=' + cons1_mpileup,
  634. '-reference_fasta=' + cons1_fasta,
  635. '-mv_freq_cutoff=0.01',
  636. '-mv_overall_depth_cutoff=100',
  637. '-mv_variant_depth_cutoff=20',
  638. '-cons_depth_cutoff=80',
  639. '-sliding_window_size=300',
  640. '-consensus_out=' + cons2_pren,
  641. '-mv_out=' + cons1_mv,
  642. '-base_freq_out=' + cons1_basefreq],
  643.  
  644. stdout=subprocess.PIPE,
  645. stderr=subprocess.PIPE)
  646.  
  647. print 'fasta is consensus'
  648. (stdoutdata, stderrdata) = p9.communicate()
  649. print 'Consmv STDERR', stderrdata
  650. print 'Consmv STDOUT', stdoutdata
  651. print 'Consmv rc ', p9.returncode
  652.  
  653. p10 = subprocess.Popen(['perl', '-w', '-s', n_remover,
  654. '-cutoff=46',
  655. cons2_pren],
  656.  
  657. stdout=open(cons2_fasta, 'w'),
  658. stderr=subprocess.PIPE)
  659.  
  660. (stdoutdata, stderrdata) = p10.communicate()
  661. print 'N_remover STDERR', stderrdata
  662. print 'N_remover STDOUT', stdoutdata
  663. print 'N_remover rc ', p10.returncode
  664.  
  665. p11 = subprocess.Popen(['perl', '-w', '-s', majvar,
  666. '-mvpath=' + cons1_mv,
  667. '-basefreq=' + cons1_basefreq,
  668. '-fwdreads=' + filt_fq1,
  669. '-revreads=' + filt_fq2],
  670. stdout=subprocess.PIPE,
  671. stderr=subprocess.PIPE)
  672.  
  673. (stdoutdata, stderrdata) = p11.communicate()
  674. print 'Majvar STDERR', stderrdata
  675. print 'Majvar STDOUT', stdoutdata
  676. print 'Majvar rc ', p11.returncode
  677.  
  678. p12 = subprocess.Popen(['samtools', 'view',
  679. '-S',
  680. '-F0x4',
  681. '-c', contigs_sam],
  682. stdout=subprocess.PIPE,
  683. stderr=subprocess.PIPE)
  684.  
  685. (stdoutdata, stderrdata) = p12.communicate()
  686. print 'samtools view mapcount STDERR', stderrdata
  687. print 'samtools view mapcount STDOUT', stdoutdata
  688. print 'samtools view mapcount rc ', p12.returncode
  689.  
  690. result = ET.Element('results')
  691. items = ET.SubElement(result, 'mapping')
  692. item1 = ET.SubElement(items, 'mapping')
  693. item1.set('result', 'reads mapped to contigs')
  694. item1.text = str(stdoutdata)
  695.  
  696. xmldata = ET.tostring(result)
  697. samplexml = '%s_mapping.xml' % sample
  698. xmlfile = open(samplexml, 'w')
  699. xmlfile.write(xmldata)
  700.  
  701.  
  702. if __name__ == '__main__':
  703. main()
Add Comment
Please, Sign In to add comment