Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/bin/sh
- #software
- VCFHACKS=~/bin/vcfhacks/
- # EXPRESSION=~/projects/bridge_study/vcfs/work/expression.py
- CSQ_QUERY=~/projects/bridge_study/vcfs/work/csq_query.py
- # dirs
- WORK=~/projects/bridge_study/vcfs/work/
- AITMAN=${WORK}/aitman/
- CAMBRIDGE=${WORK}/cambridge/
- # files
- VCF_AITMAN=${AITMAN}/vep.var.eds_09-06-16.aitman.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
- VCF_CAMBRIDGE=${CAMBRIDGE}/vep.var._16-06-16.cambridge.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored
- EDS_BED=~/projects/bridge_study/updated_reportable_genes_GRCh37.bed
- HELICAL_BED=~/projects/bridge_study/helical_domains.bed
- ######################################################
- # sort_index()
- #
- # Sorts and Indexes the parsed vcf files.
- #
- ######################################################
- sort_index(){
- for vcf in $@; do
- vcf-sort $vcf | bgzip -c > ${vcf}.gz
- tabix -p vcf ${vcf}.gz
- done
- }
- ####################################################
- # AIMS:
- # A. get all known pathogenic variants
- # A1: Variants in genes of interest.
- # A2: Filter on GT.Depth, GT.GQ and QUAL fields
- # A3. All known causitive variants
- # A4. LoF variants
- # A5. GLY-X-Y variants
- # B. get all VUS variants
- ####################################################
- for VCF in $VCF_AITMAN $VCF_CAMBRIDGE; do
- # A1: Filter for variants in genes of interest only
- getVariantsByLocation.pl -b $EDS_BED -i ${VCF}.vcf -o ${VCF}.eds_genes.vcf
- # A2: Convert samples with GT Depth < 20 or GQ > 20 to no calls and only keep variants
- # with QUAL > 20
- filterGts.pl -d 20 -i ${VCF}.eds_genes.vcf -o ${VCF}.eds_genes.gt20.vcf
- getFunctionalVariants.pl \
- --input ${VCF}.eds_genes.gt20.vcf \
- --gq 20 \
- --var_qual 20 \
- --clinvar disable \
- --pass_filters \
- --output ${VCF}.eds_genes.gt20.gq20.dp20.vcf
- # A3: ALL KNOWN CAUSATIVE VARIANTS
- # get Clinically Significant variants from ClinVarPathogenic and AS_CLNSIG fields
- getFunctionalVariants.pl \
- --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
- --clinvar all \
- --output ${VCF}.eds_genes.gt20.gq20.dp20.clin_sig.vcf
- # A4: LOF VARIANTS
- getFunctionalVariants.pl \
- --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
- --classes splice_acceptor_variant splice_donor_variant frameshift_variant stop_gained \
- --clinvar disable \
- --output ${VCF}.eds_genes.gt20.gq20.dp20.lof.vcf
- # A5: GLY-X-Y VARIANTS
- # filter for variants in helical domains
- # NOTE: getFunctionalVariants.pl --eval_filter won't work with "symbol eq 'COL1A2'"
- $CSQ_QUERY \
- --vcf ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
- --helical_domains $HELICAL_BED \
- --out ${VCF}.eds_genes.gt20.gq20.dp20.helical_domain.vcf
- # A6: merge all filtered VCF files together
- # concatenate and sort all filtered VCF from A2-A4
- DIRECTORY=`dirname $VCF`
- vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf"`
- sort_index $vcfs
- compressed_vcfs=`find $DIRECTORY -name "*.eds_genes.gt20.gq20.dp20*.vcf.gz" | xargs`
- vcf-concat $compressed_vcfs > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf
- sortVcf.pl -i ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.vcf -o ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf
- # remove duplicate variants
- awk -F '\t' '/^#/ {print;prev="";next;} {key=sprintf("%s\t%s\t%s",$1,$2,$3,$4,$5,$6);if(key==prev) next;print;prev=key;}' ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.vcf > ${VCF}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
- # B: Identify all VUS variants
- # NOT COMPLETE. Need to filter out variants present in the pathogenic vcfs using filterVcfOnVcf.pl
- getFunctionalVariants.pl \
- --input ${VCF}.eds_genes.gt20.gq20.dp20.vcf \
- --clinvar disable \
- --cadd_filter 10 \
- --damaging 'sift=deleterious' \
- --damaging 'polyphen=probably_damaging' \
- --out ${VCF}.eds_genes.gt20.gq20.dp20.vus.vcf
- done
- # Merge the aitman and cambridge pathogenic vcfs
- AITMAN_VCF_PATHOGENIC=${VCF_AITMAN}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
- CAMBRIDGE_VCF_PATHOGENIC=${VCF_CAMBRIDGE}.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.vcf
- sort_index $AITMAN_VCF_PATHOGENIC $CAMBRIDGE_VCF_PATHOGENIC
- vcf-merge ${AITMAN_VCF_PATHOGENIC}.gz ${CAMBRIDGE_VCF_PATHOGENIC}.gz > ${WORK}/eds_bridge.vep.var.ts99pt9.vcf_snp1pc_evs1pc_exac1pc.caddscored.eds_genes.gt20.gq20.dp20.pathogenic.sorted.uniq.combined.vcf
Add Comment
Please, Sign In to add comment