Guest User

Untitled

a guest
Nov 17th, 2017
374
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.00 KB | None | 0 0
  1. #!/bin/bash
  2. #
  3. #SBATCH --job-name=test
  4. #SBATCH --ntasks=1
  5. #SBATCH --mem=8G
  6. #SBATCH --mail-user=vdp5@duke.edu
  7. #SBATCH -o /home/vdp5/slurm_out/jobout.txt
  8. SCRATCH=/data/wraycompute/vdp5/scratch/$SLURM_JOB_ID
  9. source /gpfs/fs0/home/vdp5/.bash_profile
  10. mkdir -p $SCRATCH
  11. cd $SCRATCH
  12. source ~/.bash_profile
  13. 4=$(realpath $4)
  14. echo $4
  15. module load jdk/1.8.0_45-fasrc01
  16. filenombre=$(basename $4)
  17. IFS='.' read -ra ADDR <<< $filenombre
  18. base=${ADDR[0]}
  19. ref=$3
  20.  
  21. module load R
  22.  
  23.  
  24. # necessary step for downstream processing. The validation stringency is necessary for malaria since its genome is full of contigs
  25. java -jar -Xmx7g $1 AddOrReplaceReadGroups INPUT=$4 OUTPUT=readgroups.bam RGID=1 RGLB=$base RGPL=illumina RGPU=unit1 RGSM=$base VALIDATION_STRINGENCY=LENIENT
  26.  
  27. # this step is necessary for GATK, but note that it is NOT for tools like freebayes since they do their own duplicates processing.
  28. java -jar -Xmx7g $1 MarkDuplicates INPUT=readgroups.bam OUTPUT=dedup_reads.bam METRICS_FILE=metrics.txt VALIDATION_STRINGENCY=LENIENT
  29. rm -rf readgroups.bam
  30. java -jar -Xmx7g $1 BuildBamIndex INPUT=dedup_reads.bam VALIDATION_STRINGENCY=LENIENT
  31.  
  32. # initial variant processing, using 4 cores. Note that we call full variants here, not the GVCF (which is what we want as the final output)
  33. java -jar -Xmx7g $2 -T HaplotypeCaller -R $ref -I dedup_reads.bam -o raw_variants.vcf -nct 4
  34.  
  35.  
  36. # we split up the variant types since SNPs and indels need different processing
  37. java -jar -Xmx7g $2 -T SelectVariants -R $ref -V raw_variants.vcf -selectType SNP -o raw_snps.vcf
  38. java -jar -Xmx7g $2 -T SelectVariants -R $ref -V raw_variants.vcf -selectType INDEL -o raw_indels.vcf
  39. rm -rf raw_variants.vcf
  40.  
  41. #basic filters
  42. java -jar -Xmx7g $2 -T VariantFiltration -R $ref -V raw_snps.vcf --filterExpression 'QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || SOR > 4.0' --filterName "basic_snp_filter" -o filtered_snps.vcf
  43. java -jar -Xmx7g $2 -T VariantFiltration -R $ref -V raw_indels.vcf --filterExpression 'QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0 || SOR > 10.0' --filterName "basic_indel_filter" -o filtered_indels.vcf
  44.  
  45. # Here is our first recalibration step. We use the hard filtered SNPs set as the training set. It's not an issue that this set isn't totally accurate because we're going to do more rounds of processing later.
  46. java -jar -Xmx7g $2 -T BaseRecalibrator -R $ref -I dedup_reads.bam -knownSites filtered_snps.vcf -knownSites filtered_indels.vcf -o recal_data.table
  47. java -jar -Xmx7g $2 -T PrintReads -R $ref -I dedup_reads.bam -BQSR recal_data.table -o recal_reads.bam
  48.  
  49. # we call the second set here using the recalculated reads (variants2)
  50. java -jar -Xmx7g $2 -T HaplotypeCaller -nct 4 -R $ref -I recal_reads.bam -o raw_variants2.vcf
  51.  
  52. #some cleanup
  53. rm -rf *.idx raw_snps.vf raw_indels.vcf recal_reads.bam recal_data.table post_recal_data.table filtered_snps.vcf filtered_indels.vcf
  54.  
  55.  
  56. java -jar -Xmx7g $2 -T SelectVariants -R $ref -V raw_variants2.vcf -selectType SNP -o raw_snps.vcf
  57. java -jar -Xmx7g $2 -T SelectVariants -R $ref -V raw_variants2.vcf -selectType INDEL -o raw_indels.vcf
  58. rm -rf raw_variants2.vcf
  59.  
  60.  
  61. java -jar -Xmx7g $2 -T VariantFiltration -R $ref -V raw_snps.vcf --filterExpression 'QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || SOR > 4.0' --filterName "basic_snp_filter" -o filtered_snps.vcf
  62. java -jar -Xmx7g $2 -T VariantFiltration -R $ref -V raw_indels.vcf --filterExpression 'QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0 || SOR > 10.0' --filterName "basic_indel_filter" -o filtered_indels.vcf
  63.  
  64. # really important to note that we use the dedup_reads file here as the input, not the printed reads from the last step. I have not tested what would happen if we used the recalculated reads here instead, but none of the guides I read seemed to do so.
  65. java -jar -Xmx7g $2 -T BaseRecalibrator -R $ref -I dedup_reads.bam -knownSites filtered_snps.vcf -knownSites filtered_indels.vcf -o recal_data.table
  66. java -jar -Xmx7g $2 -T PrintReads -R $ref -I dedup_reads.bam -BQSR recal_data.table -o recal_reads.bam
  67.  
  68. # round 3
  69. ja
Add Comment
Please, Sign In to add comment