jgcvhkb,k.

#!/usr/bin/env python

from __future__ import print_function, absolute_import, division
import os.path
import re
import sys
import sqlalchemy as sql
import numpy as np
import shutil
import uuid
import geneimpacts
import cyvcf2
import vcfnp
import math
import pysam

#python <yourscript>.py --i <input>.vcf.gz --o <output>.vcf.gz
#python myscript.py input_file output_file
#/ycga-gpfs/project/ysm/gunel/ky89/wes_data/call_sets/rawcalls_00051-00100/rawcalls/rawcalls.vcf.gz

#import sys
infile = sys.argv[1]
outfile = sys.argv[2]


#What you need to do in a python script that imports cyvcf2:
#1) read a VCF record line by line;
#2) parse a line (which should be done automatically by cyvcf2);
   #2a) if it is a header line, write it to the output VCF;
   #2b) add new header lines for corresponding new annotations you are going to add to INFO field and write it to the output VCF;
   #2c) if it is a variant record, proceed to 3.


#3) identify which sample is HomRef, Het, HomAlt and NoCall (let's refer to them as genotype-class);
#4) calculate mean GQ/GQbyDP, etc. per genotype-class (except NoCalls) or calculate mean DP from all samples but NoCalls, etc.;
#HOM_REF=0, HET=1. For gts012=True HOM_ALT=2, UKNOWN=3


#5) add the calculated results to the INFO field (there should be a defined method/setter to do this);
#6) write the line (with additional info fields) to the output VCF.

from cyvcf2 import VCF
vcf = VCF(infile)


#read the input file
myvcf=pysam.VariantFile(infile,"r")


# Add the -- field to header.
myvcf.header.formats.add("MDP0", ".", "Float", "Mean DP given genotype HOM_REF")
myvcf.header.formats.add("MDP1", ".", "Float", "Mean DP given genotype HET")
myvcf.header.formats.add("MDP2", ".", "Float", "Mean DP given genotype HOM_ALT")
myvcf.header.formats.add("MDP3", ".", "Float", "Mean DP given genotype UNKNOWN")
myvcf.header.formats.add("MGQ0", ".", "Float", "Mean GQ given genotype HOM_REF")
myvcf.header.formats.add("MGQ1", ".", "Float", "Mean GQ given genotype HET")
myvcf.header.formats.add("MGQ2", ".", "Float", "Mean GQ given genotype HOM_ALT")
myvcf.header.formats.add("MGQ3", ".", "Float", "Mean GQ given genotype UNKNOWN")
myvcf.header.formats.add("MGQD0", ".", "Float", "Mean GQD given genotype HOM_REF")
myvcf.header.formats.add("MGQD1", ".", "Float", "Mean GQD given genotype HET")
myvcf.header.formats.add("MGQD2", ".", "Float", "Mean GQD given genotype HOM_ALT")
myvcf.header.formats.add("MGQD3", ".", "Float", "Mean GQD given genotype UNKNOWN")


# create an object of new vcf file and open in to write data.
vcf_out = pysam.VariantFile(outfile, 'w', header=myvcf.header)

with open(outfile, "a") as out:

    dp_means = []
    gq_means = []
    gqd_means = []
    for cy_variant, my_variant in zip(vcf, myvcf):
        dp = np.array(cy_variant.format('DP').flatten())
        gq = cy_variant.format('GQ').flatten()
        gqd = cy_variant.format('GQD').flatten()
        a = cy_variant.gt_types
        dp_one, dp_two, dp_three, dp_zero = [], [], [], []
        gq_one, gq_two, gq_three, gq_zero = [], [], [], []
        gqd_one, gqd_two, gqd_three, gqd_zero = [], [], [], []
        for i, sample in enumerate(dp):
            if math.isnan(sample):
                continue
            if a[i] == 0:
                dp_zero.append(sample)
            elif a[i] == 1:
                dp_one.append(sample)
            elif a[i] == 2:
                dp_two.append(sample)
            elif a[i] == 3:
                dp_three.append(sample)
        for i, sample in enumerate(gq):
            if math.isnan(sample):
                continue
            if a[i] == 0:
                gq_zero.append(sample)
            elif a[i] == 1:
                gq_one.append(sample)
            elif a[i] == 2:
                gq_two.append(sample)
            elif a[i] == 3:
                gq_three.append(sample)
        for i, sample in enumerate(gqd):
            if math.isnan(sample):
                continue
            if a[i] == 0:
                dp_zero.append(sample)
            elif a[i] == 1:
                dp_one.append(sample)
            elif a[i] == 2:
                dp_two.append(sample)
            elif a[i] == 3:
                dp_three.append(sample)
        h = str(np.mean(dp_zero)) # this variant's mean of all dp samples with corresponding gt_type 0
        i = str(np.mean(dp_one)) # this variant's mean of all dp samples with corresponding gt_type 1
        j = str(np.mean(dp_two)) # etc
        k = str(np.mean(dp_three))
        l = str(np.mean(gq_zero))
        m = str(np.mean(gq_one))
        n = str(np.mean(gq_two))
        o = str(np.mean(gq_three))
        p = str(np.mean(gqd_zero))
        q = str(np.mean(gqd_one))
        r = str(np.mean(gqd_two))
        s = str(np.mean(gqd_three))
        my_variant.samples[sample]['MDP0'] = h
        my_variant.samples[sample]['MDP1'] = i
        my_variant.samples[sample]['MDP2'] = j
        my_variant.samples[sample]['MDP3'] = k
        my_variant.samples[sample]['MGQ0'] = l
        my_variant.samples[sample]['MGQ1'] = m
        my_variant.samples[sample]['MGQ2'] = n
        my_variant.samples[sample]['MGQ3'] = o
        my_variant.samples[sample]['MGQD0'] = p
        my_variant.samples[sample]['MGQD1'] = q
        my_variant.samples[sample]['MGQD2'] = r
        my_variant.samples[sample]['MGQD3'] = s
        out.write(str(my_variant))