Advertisement
Guest User

Untitled

a guest
Jun 19th, 2018
144
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 3.92 KB | None | 0 0
  1. #!/bin/sh
  2.  
  3. # software
  4. VCFHACKS=~/bin/vcfhacks/
  5. # EXPRESSION=~/projects/bridge_study/vcfs/work/expression.py
  6. CSQ_QUERY=~/projects/bridge_study/vcfs/work/csq_query.py
  7.  
  8. # dirs
  9. WORK=~/projects/bridge_study/vcfs/work/
  10. AITMAN=${WORK}/aitman/
  11. CAMBRIDGE=${WORK}/cambridge/
  12.  
  13. # files
  14. VCF_AITMAN=${AITMAN}/vep.var.eds_09-06-16.aitman.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
  15. VCF_CAMBRIDGE=${CAMBRIDGE}/vep.var._16-06-16.cambridge.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
  16. EDS_BED=~/projects/bridge_study/updated_reportable_genes_GRCh37.bed
  17. HELICAL_BED=~/projects/bridge_study/helical_domains.bed
  18.  
  19.  
  20. ######################################################
  21. # sort_index()
  22. #
  23. # Sorts and Indexes the parsed vcf files.
  24. #
  25. ######################################################
  26.  
  27. sort_index(){
  28.     for vcf in $@; do
  29.         vcf-sort $vcf | bgzip -c > ${vcf}.gz
  30.         tabix -p vcf ${vcf}.gz
  31.     done
  32. }
  33.  
  34.  
  35. ####################################################
  36. # AIMS:
  37. # A. get all known pathogenic variants
  38. #     A1: Variants in genes of interest.
  39. #     A2: Filter on GT.Depth, GT.GQ and QUAL fields
  40. #     A3. All known causitive variants
  41. #     A4. LoF variants
  42. #     A5. GLY-X-Y variants
  43. # B. get all VUS variants
  44. ####################################################
  45.  
  46.  
  47. for VCF in $VCF_AITMAN $VCF_CAMBRIDGE; do
  48.  
  49.     # A1: Filter for variants in genes of interest only
  50.     getVariantsByLocation.pl -b $EDS_BED -i ${VCF}.vcf -o ${VCF}.eds_genes.vcf
  51.  
  52.     # A2: Convert samples with GT Depth < 20 or GQ > 20 to no calls and only keep variants
  53.     #     with QUAL > 20
  54.     filterGts.pl -d 20 -i ${VCF}.eds_genes.vcf -o ${VCF}.eds_genes.gt20.vcf
  55.     getFunctionalVariants.pl \
  56.           --input ${VCF}.eds_genes.gt20.vcf \
  57.       --gq 20 \
  58.       --var_qual 20 \
  59.           --clinvar disable \
  60.           --output ${VCF}.eds_genes.gt20.gq20.dp20.vcf
  61.  
  62.     # A3: ALL KNOWN CAUSATIVE VARIANTS
  63.     # get Clinically Significant variants from ClinVarPathogenic and AS_CLNSIG fields
  64.     getFunctionalVariants.pl \
  65.           --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  66.           --clinvar all \
  67.           --output ${VCF}.eds_genes.gt20.gq20.dp20.clin_sig.vcf
  68.  
  69.     # A4: LOF VARIANTS
  70.     getFunctionalVariants.pl \
  71.       --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  72.       --classes splice_acceptor_variant splice_donor_variant frameshift_variant stop_gained \
  73.       --clinvar disable \
  74.       --output ${VCF}.eds_genes.gt20.gq20.dp20.lof.vcf
  75.  
  76.     # A5: GLY-X-Y VARIANTS
  77.     # filter for variants in helical domains
  78.     # NOTE: getFunctionalVariants.pl --eval_filter won't work with "symbol eq 'COL1A2'"
  79.     $CSQ_QUERY \
  80.       --vcf ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
  81.       --helical_domains $HELICAL_BED \
  82.       --out ${VCF}.eds_genes.gt20.gq20.dp20.helical_domain.vcf
  83.  
  84.     # A6: merge all filtered VCF files together
  85.     # concatenate and sort all filtered VCF from A2-A4
  86.     DIRECTORY=`dirname $VCF`
  87.     vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf"`
  88.     sort_index $vcfs
  89.     compressed_vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf.gz" | xargs`
  90.     vcf-concat $compressed_vcfs  > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf
  91.     sortVcf.pl -i ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf
  92.     # remove duplicate variants
  93.     awk -F '\t' '/^#/ {print;prev="";next;} {key=sprintf("%s\t%s\t%s",$1,$2,$3,$4,$5,$6);if(key==prev) next;print;prev=key;}' ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf   
  94. done
  95.  
  96.  
  97. # Merge the aitman and cambridge pathogenic vcfs
  98. AITMAN_VCF_PATHOGENIC=${VCF_AITMAN}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
  99. CAMBRIDGE_VCF_PATHOGENIC=${VCF_CAMBRIDGE}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
  100. sort_index $AITMAN_VCF_PATHOGENIC $CAMBRIDGE_VCF_PATHOGENIC
  101. vcf-merge  ${AITMAN_VCF_PATHOGENIC}.gz ${CAMBRIDGE_VCF_PATHOGENIC}.gz > ${WORK}/eds_bridge.vep.var.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.combined.vcf
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement