Advertisement
Guest User

analysis

a guest
Jun 19th, 2018
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 4.24 KB | None | 0 0
  1. #!/bin/sh
  2.  
  3. # software
  4. VCFHACKS=~/bin/vcfhacks/
  5. # EXPRESSION=~/projects/bridge_study/vcfs/work/expression.py
  6. CSQ_QUERY=~/projects/bridge_study/vcfs/work/csq_query.py
  7.  
  8. # dirs
  9. WORK=~/projects/bridge_study/vcfs/work/
  10. AITMAN=${WORK}/aitman/
  11. CAMBRIDGE=${WORK}/cambridge/
  12.  
  13. # files
  14. VCF_AITMAN=${AITMAN}/vep.var.eds_09-06-16.aitman.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
  15. VCF_CAMBRIDGE=${CAMBRIDGE}/vep.var._16-06-16.cambridge.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
  16. EDS_BED=~/projects/bridge_study/updated_reportable_genes_GRCh37.bed
  17. HELICAL_BED=~/projects/bridge_study/helical_domains.bed
  18.  
  19.  
  20. ######################################################
  21. # sort_index()
  22. #
  23. # Sorts and Indexes the parsed vcf files.
  24. #
  25. ######################################################
  26.  
  27. sort_index(){
  28.     for vcf in $@; do
  29.         vcf-sort $vcf | bgzip -c > ${vcf}.gz
  30.         tabix -p vcf ${vcf}.gz
  31.     done
  32. }
  33.  
  34.  
  35. ####################################################
  36. # AIMS:
  37. # A. get all known pathogenic variants
  38. #     A1: Variants in genes of interest.
  39. #     A2: Filter on GT.Depth, GT.GQ and QUAL fields
  40. #     A3. All known causitive variants
  41. #     A4. LoF variants
  42. #     A5. GLY-X-Y variants
  43. # B. get all VUS variants
  44. ####################################################
  45.  
  46.  
  47. for VCF in $VCF_AITMAN $VCF_CAMBRIDGE; do
  48.  
  49.     # A1: Filter for variants in genes of interest only
  50.     getVariantsByLocation.pl -b $EDS_BED -i ${VCF}.vcf -o ${VCF}.eds_genes.vcf
  51.  
  52.     # A2: Convert samples with GT Depth < 20 or GQ > 20 to no calls and only keep variants
  53.     #     with QUAL > 20
  54.     filterGts.pl -d 20 -i ${VCF}.eds_genes.vcf -o ${VCF}.eds_genes.gt20.vcf
  55.     getFunctionalVariants.pl \
  56.           --input ${VCF}.eds_genes.gt20.vcf \
  57.       --gq 20 \
  58.       --var_qual 20 \
  59.           --clinvar disable \
  60.           --output ${VCF}.eds_genes.gt20.gq20.dp20.vcf
  61.  
  62.     # A3: ALL KNOWN CAUSATIVE VARIANTS
  63.         # get Clinically Significant variants from ClinVarPathogenic and AS_CLNSIG fields
  64.     echo "\n\nIdentifying clinically significant variants\n\n"
  65.     getFunctionalVariants.pl \
  66.           --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  67.           --clinvar all \
  68.           --output ${VCF}.eds_genes.gt20.gq20.dp20.clin_sig.vcf
  69.  
  70.     # A4: LOF VARIANTS
  71.     echo "\n\nIdentifying LOF variants\n\n"
  72.     getFunctionalVariants.pl \
  73.       --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  74.       --classes splice_acceptor_variant splice_donor_variant frameshift_variant stop_gained \
  75.       --clinvar disable \
  76.       --output ${VCF}.eds_genes.gt20.gq20.dp20.lof.vcf
  77.  
  78.     # A5: GLY-X-Y VARIANTS
  79.     # filter for variants in helical domains
  80.     # NOTE: getFunctionalVariants.pl --eval_filter won't work with "symbol eq 'COL1A2'"
  81.     echo "\n\nIdentifying GLYXY variants\n\n"
  82.     $CSQ_QUERY \
  83.       --vcf ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  84.       --helical_domains $HELICAL_BED \
  85.       --out ${VCF}.eds_genes.gt20.gq20.dp20.helical_domain.vcf
  86.  
  87.     # A6: merge all filtered VCF files together
  88.     # concatenate and sort all filtered VCF from A2-A4
  89.     echo "\n\nConcatenating VCFs\n\n"
  90.     DIRECTORY=`dirname $VCF`
  91.     vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf"`
  92.     echo "\n\n\n$vcfs\n\n\n"
  93.     sort_index $vcfs
  94.     compressed_vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf.gz" | xargs`
  95.     vcf-concat $compressed_vcfs  > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf
  96.     echo "\n\nSorting VCFs"
  97.     sortVcf.pl -i ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf
  98.     # remove duplicate variants
  99.     echo "\n\nRemoving Duplicates\n\n"
  100.     awk -F '\t' '/^#/ {print;prev="";next;} {key=sprintf("%s\t%s\t%s",$1,$2,$3,$4,$5,$6);if(key==prev) next;print;prev=key;}' ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf   
  101. done
  102.  
  103.  
  104. # Merge the aitman and cambridge pathogenic vcfs
  105. AITMAN_VCF_PATHOGENIC=${VCF_AITMAN}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
  106. CAMBRIDGE_VCF_PATHOGENIC=${VCF_CAMBRIDGE}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
  107. sort_index $AITMAN_VCF_PATHOGENIC $CAMBRIDGE_VCF_PATHOGENIC
  108. echo "\n\n\nMerging Pathogenic VCFS\n\n\n"
  109. vcf-merge  ${AITMAN_VCF_PATHOGENIC}.gz ${CAMBRIDGE_VCF_PATHOGENIC}.gz > ${WORK}/eds_bridge.vep.var.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.combined.vcf
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement