Advertisement
ProzacR

pgkb-ngs_run.sh

Dec 3rd, 2018
752
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 6.74 KB | None | 0 0
  1. #!/usr/bin/env bash
  2.  
  3. # check input
  4. if test "$#" -ne 5; then
  5.     echo "Use: run.sh fasta1.gz fasta2.gz @RG\tID:fasta1.gz\tPL:ILLUMINA\tSM:1 working_dir output_prefix"
  6.     exit
  7. fi
  8.  
  9. # Get inputs, fasta pairs, RG and working directory and output name
  10. #FASTA1=$1
  11. #FASTA2=$2
  12. #RG=$3
  13. #WORKING_DIR=$4
  14. #NAME=$5
  15. FASTA1="test/fasta1.gz"
  16. FASTA2="test/fasta2.gz"
  17. RG="@RG\tID:fasta1.gz\tPL:ILLUMINA\tSM:1"
  18. WORKING_DIR="working_dir"
  19. NAME="test"
  20.  
  21.  
  22. GATK="/tools/gatk-4.0.11.0/gatk"
  23. PICARD="java -jar /tools/picard/build/libs/picard.jar"
  24. BWA="bwa mem"
  25.  
  26. REF='external_data/genome/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz'
  27. KNOWN_INDELS_MILLS='external_data/hg38bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'
  28. KNOWN_INDELS='external_data/hg38bundle/Homo_sapiens_assembly38.known_indels.vcf.gz'
  29. KNOWN_DBSNP='external_data/hg38bundle/dbsnp_144.hg38.vcf.gz' # for now same
  30. CURRENT_DBSNP='external_data/hg38bundle/dbsnp_144.hg38.vcf.gz'
  31. HAPMAP='external_data/hg38bundle/hapmap_3.3.hg38.vcf.gz'
  32. OMNI='external_data/hg38bundle/1000G_omni2.5.hg38.vcf.gz'
  33. HC1000G='external_data/hg38bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz'
  34.  
  35. echo "Running grc38 pipeline"
  36.  
  37. echo "Running BWA:"
  38. $BWA -o $WORKING_DIR/output.sam -t 20 $REF $FASTA1 $FASTA2 || exit 1
  39.  
  40. echo "indexing bam:"
  41. $PICARD BuildBamIndex INPUT=working_dir/output.bam OUTPUT=working_dir/output.aln.bam.bai || exit 1
  42.  
  43. echo "Moving files"
  44. mv $WORKING_DIR/output.aln.bam $WORKING_DIR/output.moved.bam || exit 1
  45. mv $WORKING_DIR/output.aln.bam.bai $WORKING_DIR/output.moved.bam.bai || exit 1
  46.  
  47. echo "Realigning"
  48. $GATK -T RealignerTargetCreator -I $WORKING_DIR/output.moved.bam -ip 100 -R $REF -nt 36 -o $WORKING_DIR/output.realigner.intervals -known $KNOWN_INDELS_MILLS -known $KNOWN_INDELS || exit 1
  49. $GATK -T IndelRealigner -I $WORKING_DIR/output.moved.bam -ip 100 -R $REF  -known $KNOWN_INDELS_MILLS -known $KNOWN_INDELS -targetIntervals $WORKING_DIR/output.realigner.intervals -o $WORKING_DIR/output.realigned.bam -filterNoBases || exit 1
  50.  
  51. echo "making dirs"
  52. mkdir $WORKING_DIR/stats
  53. mkdir $WORKING_DIR/tranches
  54.  
  55. echo "Recalibrating alignment"
  56. $GATK -T BaseRecalibrator -I $WORKING_DIR/output.realigned.bam -ip 100 -R $REF  -nct 36 -knownSites $KNOWN_INDELS_MILLS -knownSites $KNOWN_INDELS -knownSites $KNOWN_DBSNP -o $WORKING_DIR/output.recal.before.table || exit 1
  57. $GATK -T PrintReads -I $WORKING_DIR/output.realigned.bam -ip 100 -R $REF -nct 36 -o $WORKING_DIR/output.bam || exit 1
  58. $GATK -T BaseRecalibrator -I $WORKING_DIR/output.realigned.bam -ip 100 -R $REF  -BQSR $WORKING_DIR/output.recal.before.table -nct 36 -knownSites $KNOWN_INDELS_MILLS -knownSites $KNOWN_INDELS -knownSites $KNOWN_DBSNP -o $WORKING_DIR/output.recal.after.table || exit 1
  59. $PICARD CollectMultipleMetrics TMP_DIR=$WORKING_DIR/tmp VALIDATION_STRINGENCY=STRICT MAX_RECORDS_IN_RAM=8000000 INPUT=$WORKING_DIR/output.bam OUTPUT=$WORKING_DIR/stats/output.metrics PROGRAM= QualityScoreDistribution PROGRAM= MeanQualityByCycle PROGRAM= CollectAlignmentSummaryMetrics PROGRAM= CollectInsertSizeMetrics PROGRAM= CollectBaseDistributionByCycle || exit 1
  60.  
  61. echo "Finishing alignment"
  62.  
  63. echo "Running calling"
  64. $GATK -T HaplotypeCaller -I $WORKING_DIR/output.bam -ip 100 -R $REF -disable_auto_index_creation_and_locking_when_reading_rods -nct 16 -o $WORKING_DIR/output.g.vcf.gz  -ERC GVCF -hets 0.001 -indelHeterozygosity 1.25E-4 -gt_mode DISCOVERY -mbq 10 -maxReadsInRegionPerSample 1000 -minReadsPerAlignStart 5 -activeRegionExtension 100 -activeRegionMaxSize 300 || exit 1
  65.  
  66. echo "Genotyping gvcf"
  67. $GATK -T GenotypeGVCFs -ip 100 -R $REF -disable_auto_index_creation_and_locking_when_reading_rods -nt 16 -V $WORKING_DIR/output.g.vcf.gz -o $WORKING_DIR/$NAME.vcf.calls.vcf.gz -allSites -hets 0.001 -indelHeterozygosity 1.25E-4 -stand_call_conf 30.0 -stand_emit_conf 30.0 || exit 1
  68.  
  69. echo "Running annotator"
  70. $GATK -T VariantAnnotator -I $WORKING_DIR/output.bam -ip 100 -R $REF -disable_auto_index_creation_and_locking_when_reading_rods -nt 16 -V $WORKING_DIR/$NAME.vcf.calls.vcf.gz -D $CURRENT_DBSNP -o $WORKING_DIR/$NAME.vcf.annotated.vcf.gz  -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A FisherStrand -A StrandOddsRatio -A Coverage -A QualByDepth -A Coverage -A FisherStrand -A StrandOddsRatio -A ReadPosRankSumTest -A MappingQualityRankSumTest || exit 1
  71.  
  72. echo "Variant recalibration"
  73. $GATK  -T VariantRecalibrator  -ip 100  -R $REF  -disable_auto_index_creation_and_locking_when_reading_rods  -nt 16  -mode SNP  -mG 8  -input $WORKING_DIR/$NAME.vcf.annotated.vcf.gz  -resource:known=false,training=true,truth=true,prior=15.0 $HAPMAP -resource:known=false,training=true,truth=true,prior=12.0 $OMNI -resource:known=false,training=true,truth=false,prior=10.0 $HC1000G -resource:known=true,training=false,truth=false,prior=2.0 $KNOWN_DBSNP  -recalFile $WORKING_DIR/$NAME.vcf.snps.recal  -tranchesFile $WORKING_DIR/tranches/snps.tranches  -an MQ -an FS -an SOR -an DP -an QD  -tranche 100.0 -tranche 99.99 -tranche 99.9 -tranche 99.0 -tranche 90.0  -rscriptFile $WORKING_DIR/tranches/snps.R || exit 1
  74.  
  75. echo "Apply variant recalibration"
  76. $GATK  -T ApplyRecalibration  -ip 100  -R $REF  -disable_auto_index_creation_and_locking_when_reading_rods  -nt 16  -input $WORKING_DIR/$NAME.vcf.annotated.vcf.gz  -recalFile $WORKING_DIR/$NAME.vcf.snps.recal  -tranchesFile $WORKING_DIR/tranches/snps.tranches  -o $WORKING_DIR/$NAME.vcf.recal-snps.vcf.gz  -ts_filter_level 90.0  -mode SNP || exit 1
  77.  
  78.  
  79. echo "Indel recalibration"
  80. $GATK  -T VariantRecalibrator  -ip 100  -R $REF  -disable_auto_index_creation_and_locking_when_reading_rods  -nt 16  -mode INDEL  -mG 4  -input $WORKING_DIR/$NAME.vcf.recal-snps.vcf.gz  -resource:known=false,training=true,truth=true,prior=12.0 $KNOWN_INDELS_MILLS -resource:known=true,training=false,truth=false,prior=2.0 $KNOWN_DBSNP  -recalFile $WORKING_DIR/$NAME.vcf.indels.recal  -tranchesFile $WORKING_DIR/tranches/indels.tranches  -an QD -an FS -an SOR -an MQ  -tranche 100.0 -tranche 99.99 -tranche 99.9 -tranche 99.0 -tranche 90.0  -rscriptFile $WORKING_DIR/tranches/indels.R || exit 1
  81.  
  82. echo "Apply indel recalibration"
  83. $GATK  -T ApplyRecalibration  -ip 100  -R $REF  -disable_auto_index_creation_and_locking_when_reading_rods  -nt 16  -input $WORKING_DIR/$NAME.vcf.recal-snps.vcf.gz  -recalFile $WORKING_DIR/$NAME.vcf.indels.recal  -tranchesFile $WORKING_DIR/tranches/indels.tranches  -o $WORKING_DIR/$NAME.vcf.recal-indels.vcf.gz  -ts_filter_level 90.0  -mode INDEL || exit 1
  84.  
  85. echo "Analyze covariates"
  86. $GATK  -T AnalyzeCovariates  -ip 100  -R $REF  -before $WORKING_DIR/output.recal.before.table  -after $WORKING_DIR/output.recal.after.table  -plots $WORKING_DIR/stats/output.recal.pdf  -csv $WORKING_DIR/stats/output.recal.csv || exit 1
  87.  
  88.  
  89. #echo "Extract PGX genes for PharmCAT"
  90. #Select a sample and restrict the output vcf to a set of intervals:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement