Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- from __future__ import print_function, absolute_import, division
- import os.path
- import re
- import sys
- import sqlalchemy as sql
- import numpy as np
- import shutil
- import uuid
- import geneimpacts
- import cyvcf2
- import vcfnp
- import math
- import pysam
- #python <yourscript>.py --i <input>.vcf.gz --o <output>.vcf.gz
- #python myscript.py input_file output_file
- #/ycga-gpfs/project/ysm/gunel/ky89/wes_data/call_sets/rawcalls_00051-00100/rawcalls/rawcalls.vcf.gz
- #import sys
- infile = sys.argv[1]
- outfile = sys.argv[2]
- #What you need to do in a python script that imports cyvcf2:
- #1) read a VCF record line by line;
- #2) parse a line (which should be done automatically by cyvcf2);
- #2a) if it is a header line, write it to the output VCF;
- #2b) add new header lines for corresponding new annotations you are going to add to INFO field and write it to the output VCF;
- #2c) if it is a variant record, proceed to 3.
- #3) identify which sample is HomRef, Het, HomAlt and NoCall (let's refer to them as genotype-class);
- #4) calculate mean GQ/GQbyDP, etc. per genotype-class (except NoCalls) or calculate mean DP from all samples but NoCalls, etc.;
- #HOM_REF=0, HET=1. For gts012=True HOM_ALT=2, UKNOWN=3
- #5) add the calculated results to the INFO field (there should be a defined method/setter to do this);
- #6) write the line (with additional info fields) to the output VCF.
- from cyvcf2 import VCF
- vcf = VCF(infile)
- #read the input file
- myvcf=pysam.VariantFile(infile,"r")
- # Add the -- field to header.
- myvcf.header.formats.add("MDP0", ".", "Float", "Mean DP given genotype HOM_REF")
- myvcf.header.formats.add("MDP1", ".", "Float", "Mean DP given genotype HET")
- myvcf.header.formats.add("MDP2", ".", "Float", "Mean DP given genotype HOM_ALT")
- myvcf.header.formats.add("MDP3", ".", "Float", "Mean DP given genotype UNKNOWN")
- myvcf.header.formats.add("MGQ0", ".", "Float", "Mean GQ given genotype HOM_REF")
- myvcf.header.formats.add("MGQ1", ".", "Float", "Mean GQ given genotype HET")
- myvcf.header.formats.add("MGQ2", ".", "Float", "Mean GQ given genotype HOM_ALT")
- myvcf.header.formats.add("MGQ3", ".", "Float", "Mean GQ given genotype UNKNOWN")
- myvcf.header.formats.add("MGQD0", ".", "Float", "Mean GQD given genotype HOM_REF")
- myvcf.header.formats.add("MGQD1", ".", "Float", "Mean GQD given genotype HET")
- myvcf.header.formats.add("MGQD2", ".", "Float", "Mean GQD given genotype HOM_ALT")
- myvcf.header.formats.add("MGQD3", ".", "Float", "Mean GQD given genotype UNKNOWN")
- # create an object of new vcf file and open in to write data.
- vcf_out = pysam.VariantFile(outfile, 'w', header=myvcf.header)
- with open(outfile, "a") as out:
- dp_means = []
- gq_means = []
- gqd_means = []
- for cy_variant, my_variant in zip(vcf, myvcf):
- dp = np.array(cy_variant.format('DP').flatten())
- gq = cy_variant.format('GQ').flatten()
- gqd = cy_variant.format('GQD').flatten()
- a = cy_variant.gt_types
- dp_one, dp_two, dp_three, dp_zero = [], [], [], []
- gq_one, gq_two, gq_three, gq_zero = [], [], [], []
- gqd_one, gqd_two, gqd_three, gqd_zero = [], [], [], []
- for i, sample in enumerate(dp):
- if math.isnan(sample):
- continue
- if a[i] == 0:
- dp_zero.append(sample)
- elif a[i] == 1:
- dp_one.append(sample)
- elif a[i] == 2:
- dp_two.append(sample)
- elif a[i] == 3:
- dp_three.append(sample)
- for i, sample in enumerate(gq):
- if math.isnan(sample):
- continue
- if a[i] == 0:
- gq_zero.append(sample)
- elif a[i] == 1:
- gq_one.append(sample)
- elif a[i] == 2:
- gq_two.append(sample)
- elif a[i] == 3:
- gq_three.append(sample)
- for i, sample in enumerate(gqd):
- if math.isnan(sample):
- continue
- if a[i] == 0:
- dp_zero.append(sample)
- elif a[i] == 1:
- dp_one.append(sample)
- elif a[i] == 2:
- dp_two.append(sample)
- elif a[i] == 3:
- dp_three.append(sample)
- h = str(np.mean(dp_zero)) # this variant's mean of all dp samples with corresponding gt_type 0
- i = str(np.mean(dp_one)) # this variant's mean of all dp samples with corresponding gt_type 1
- j = str(np.mean(dp_two)) # etc
- k = str(np.mean(dp_three))
- l = str(np.mean(gq_zero))
- m = str(np.mean(gq_one))
- n = str(np.mean(gq_two))
- o = str(np.mean(gq_three))
- p = str(np.mean(gqd_zero))
- q = str(np.mean(gqd_one))
- r = str(np.mean(gqd_two))
- s = str(np.mean(gqd_three))
- my_variant.samples[sample]['MDP0'] = h
- my_variant.samples[sample]['MDP1'] = i
- my_variant.samples[sample]['MDP2'] = j
- my_variant.samples[sample]['MDP3'] = k
- my_variant.samples[sample]['MGQ0'] = l
- my_variant.samples[sample]['MGQ1'] = m
- my_variant.samples[sample]['MGQ2'] = n
- my_variant.samples[sample]['MGQ3'] = o
- my_variant.samples[sample]['MGQD0'] = p
- my_variant.samples[sample]['MGQD1'] = q
- my_variant.samples[sample]['MGQD2'] = r
- my_variant.samples[sample]['MGQD3'] = s
- out.write(str(my_variant))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement