Guest User

Untitled

a guest
Jun 20th, 2018
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.22 KB | None | 0 0
  1. #!/bin/sh
  2.  
  3. #software
  4. VCFHACKS=~/bin/vcfhacks/
  5. # EXPRESSION=~/projects/bridge_study/vcfs/work/expression.py
  6. CSQ_QUERY=~/projects/bridge_study/vcfs/work/csq_query.py
  7.  
  8. # dirs
  9. WORK=~/projects/bridge_study/vcfs/work/
  10. AITMAN=${WORK}/aitman/
  11. CAMBRIDGE=${WORK}/cambridge/
  12.  
  13. # files
  14. VCF_AITMAN=${AITMAN}/vep.var.eds_09-06-16.aitman.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
  15. VCF_CAMBRIDGE=${CAMBRIDGE}/vep.var._16-06-16.cambridge.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
  16. EDS_BED=~/projects/bridge_study/updated_reportable_genes_GRCh37.bed
  17. HELICAL_BED=~/projects/bridge_study/helical_domains.bed
  18.  
  19.  
  20. ######################################################
  21. # sort_index()
  22. #
  23. # Sorts and Indexes the parsed vcf files.
  24. #
  25. ######################################################
  26.  
  27. sort_index(){
  28. for vcf in $@; do
  29. vcf-sort $vcf | bgzip -c > ${vcf}.gz
  30. tabix -p vcf ${vcf}.gz
  31. done
  32. }
  33.  
  34.  
  35. ####################################################
  36. # AIMS:
  37. # A. get all known pathogenic variants
  38. # A1: Variants in genes of interest.
  39. # A2: Filter on GT.Depth, GT.GQ and QUAL fields
  40. # A3. All known causitive variants
  41. # A4. LoF variants
  42. # A5. GLY-X-Y variants
  43. # B. get all VUS variants
  44. ####################################################
  45.  
  46.  
  47. for VCF in $VCF_AITMAN $VCF_CAMBRIDGE; do
  48.  
  49. # A1: Filter for variants in genes of interest only
  50. getVariantsByLocation.pl -b $EDS_BED -i ${VCF}.vcf -o ${VCF}.eds_genes.vcf
  51.  
  52. # A2: Convert samples with GT Depth < 20 or GQ > 20 to no calls and only keep variants
  53. # with QUAL > 20
  54. filterGts.pl -d 20 -i ${VCF}.eds_genes.vcf -o ${VCF}.eds_genes.gt20.vcf
  55. getFunctionalVariants.pl \
  56. --input ${VCF}.eds_genes.gt20.vcf \
  57. --gq 20 \
  58. --var_qual 20 \
  59. --clinvar disable \
  60. --pass_filters \
  61. --output ${VCF}.eds_genes.gt20.gq20.dp20.vcf
  62.  
  63. # A3: ALL KNOWN CAUSATIVE VARIANTS
  64. # get Clinically Significant variants from ClinVarPathogenic and AS_CLNSIG fields
  65. getFunctionalVariants.pl \
  66. --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  67. --clinvar all \
  68. --output ${VCF}.eds_genes.gt20.gq20.dp20.clin_sig.vcf
  69.  
  70. # A4: LOF VARIANTS
  71. getFunctionalVariants.pl \
  72. --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  73. --classes splice_acceptor_variant splice_donor_variant frameshift_variant stop_gained \
  74. --clinvar disable \
  75. --output ${VCF}.eds_genes.gt20.gq20.dp20.lof.vcf
  76.  
  77. # A5: GLY-X-Y VARIANTS
  78. # filter for variants in helical domains
  79. # NOTE: getFunctionalVariants.pl --eval_filter won't work with "symbol eq 'COL1A2'"
  80. $CSQ_QUERY \
  81. --vcf ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  82. --helical_domains $HELICAL_BED \
  83. --out ${VCF}.eds_genes.gt20.gq20.dp20.helical_domain.vcf
  84.  
  85. # A6: merge all filtered VCF files together
  86. # concatenate and sort all filtered VCF from A2-A4
  87. DIRECTORY=`dirname $VCF`
  88. vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf"`
  89. sort_index $vcfs
  90. compressed_vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf.gz" | xargs`
  91. vcf-concat $compressed_vcfs > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf
  92. sortVcf.pl -i ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf
  93. # remove duplicate variants
  94. awk -F '\t' '/^#/ {print;prev="";next;} {key=sprintf("%s\t%s\t%s",$1,$2,$3,$4,$5,$6);if(key==prev) next;print;prev=key;}' ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
  95.  
  96. # B: Identify all VUS variants
  97. # NOT COMPLETE. Need to filter out variants present in the pathogenic vcfs using filterVcfOnVcf.pl
  98. getFunctionalVariants.pl \
  99. --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  100. --clinvar disable \
  101. --cadd_filter 10 \
  102. --damaging 'sift=deleterious' \
  103. --damaging 'polyphen=probably_damaging' \
  104. --out ${VCF}.eds_genes.gt20.gq20.dp20.vus.vcf
  105.  
  106. done
  107.  
  108.  
  109. # Merge the aitman and cambridge pathogenic vcfs
  110. AITMAN_VCF_PATHOGENIC=${VCF_AITMAN}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
  111. CAMBRIDGE_VCF_PATHOGENIC=${VCF_CAMBRIDGE}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
  112. sort_index $AITMAN_VCF_PATHOGENIC $CAMBRIDGE_VCF_PATHOGENIC
  113. vcf-merge ${AITMAN_VCF_PATHOGENIC}.gz ${CAMBRIDGE_VCF_PATHOGENIC}.gz > ${WORK}/eds_bridge.vep.var.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.combined.vcf
Add Comment
Please, Sign In to add comment