Advertisement
Guest User

jgcvhkb,k.

a guest
Jan 19th, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.43 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. from __future__ import print_function, absolute_import, division
  4. import os.path
  5. import re
  6. import sys
  7. import sqlalchemy as sql
  8. import numpy as np
  9. import shutil
  10. import uuid
  11. import geneimpacts
  12. import cyvcf2
  13. import vcfnp
  14. import math
  15. import pysam
  16.  
  17. #python <yourscript>.py --i <input>.vcf.gz --o <output>.vcf.gz
  18. #python myscript.py input_file output_file
  19. #/ycga-gpfs/project/ysm/gunel/ky89/wes_data/call_sets/rawcalls_00051-00100/rawcalls/rawcalls.vcf.gz
  20.  
  21. #import sys
  22. infile = sys.argv[1]
  23. outfile = sys.argv[2]
  24.  
  25.  
  26. #What you need to do in a python script that imports cyvcf2:
  27. #1) read a VCF record line by line;
  28. #2) parse a line (which should be done automatically by cyvcf2);
  29.    #2a) if it is a header line, write it to the output VCF;
  30.    #2b) add new header lines for corresponding new annotations you are going to add to INFO field and write it to the output VCF;
  31.    #2c) if it is a variant record, proceed to 3.
  32.  
  33.    
  34.  
  35. #3) identify which sample is HomRef, Het, HomAlt and NoCall (let's refer to them as genotype-class);
  36. #4) calculate mean GQ/GQbyDP, etc. per genotype-class (except NoCalls) or calculate mean DP from all samples but NoCalls, etc.;
  37. #HOM_REF=0, HET=1. For gts012=True HOM_ALT=2, UKNOWN=3
  38.  
  39.  
  40. #5) add the calculated results to the INFO field (there should be a defined method/setter to do this);
  41. #6) write the line (with additional info fields) to the output VCF.
  42.  
  43. from cyvcf2 import VCF
  44. vcf = VCF(infile)
  45.  
  46.  
  47. #read the input file
  48. myvcf=pysam.VariantFile(infile,"r")
  49.  
  50.  
  51. # Add the -- field to header.
  52. myvcf.header.formats.add("MDP0", ".", "Float", "Mean DP given genotype HOM_REF")
  53. myvcf.header.formats.add("MDP1", ".", "Float", "Mean DP given genotype HET")
  54. myvcf.header.formats.add("MDP2", ".", "Float", "Mean DP given genotype HOM_ALT")
  55. myvcf.header.formats.add("MDP3", ".", "Float", "Mean DP given genotype UNKNOWN")
  56. myvcf.header.formats.add("MGQ0", ".", "Float", "Mean GQ given genotype HOM_REF")
  57. myvcf.header.formats.add("MGQ1", ".", "Float", "Mean GQ given genotype HET")
  58. myvcf.header.formats.add("MGQ2", ".", "Float", "Mean GQ given genotype HOM_ALT")
  59. myvcf.header.formats.add("MGQ3", ".", "Float", "Mean GQ given genotype UNKNOWN")
  60. myvcf.header.formats.add("MGQD0", ".", "Float", "Mean GQD given genotype HOM_REF")
  61. myvcf.header.formats.add("MGQD1", ".", "Float", "Mean GQD given genotype HET")
  62. myvcf.header.formats.add("MGQD2", ".", "Float", "Mean GQD given genotype HOM_ALT")
  63. myvcf.header.formats.add("MGQD3", ".", "Float", "Mean GQD given genotype UNKNOWN")
  64.  
  65.        
  66. # create an object of new vcf file and open in to write data.
  67. vcf_out = pysam.VariantFile(outfile, 'w', header=myvcf.header)
  68.  
  69. with open(outfile, "a") as out:
  70.  
  71.     dp_means = []
  72.     gq_means = []
  73.     gqd_means = []
  74.     for cy_variant, my_variant in zip(vcf, myvcf):
  75.         dp = np.array(cy_variant.format('DP').flatten())
  76.         gq = cy_variant.format('GQ').flatten()
  77.         gqd = cy_variant.format('GQD').flatten()
  78.         a = cy_variant.gt_types
  79.         dp_one, dp_two, dp_three, dp_zero = [], [], [], []
  80.         gq_one, gq_two, gq_three, gq_zero = [], [], [], []
  81.         gqd_one, gqd_two, gqd_three, gqd_zero = [], [], [], []
  82.         for i, sample in enumerate(dp):
  83.             if math.isnan(sample):  
  84.                 continue
  85.             if a[i] == 0:
  86.                 dp_zero.append(sample)
  87.             elif a[i] == 1:  
  88.                 dp_one.append(sample)
  89.             elif a[i] == 2:  
  90.                 dp_two.append(sample)
  91.             elif a[i] == 3:  
  92.                 dp_three.append(sample)
  93.         for i, sample in enumerate(gq):
  94.             if math.isnan(sample):  
  95.                 continue
  96.             if a[i] == 0:
  97.                 gq_zero.append(sample)
  98.             elif a[i] == 1:  
  99.                 gq_one.append(sample)
  100.             elif a[i] == 2:  
  101.                 gq_two.append(sample)
  102.             elif a[i] == 3:  
  103.                 gq_three.append(sample)
  104.         for i, sample in enumerate(gqd):
  105.             if math.isnan(sample):  
  106.                 continue
  107.             if a[i] == 0:
  108.                 dp_zero.append(sample)
  109.             elif a[i] == 1:  
  110.                 dp_one.append(sample)
  111.             elif a[i] == 2:  
  112.                 dp_two.append(sample)
  113.             elif a[i] == 3:  
  114.                 dp_three.append(sample)
  115.         h = str(np.mean(dp_zero)) # this variant's mean of all dp samples with corresponding gt_type 0
  116.         i = str(np.mean(dp_one)) # this variant's mean of all dp samples with corresponding gt_type 1
  117.         j = str(np.mean(dp_two)) # etc
  118.         k = str(np.mean(dp_three))
  119.         l = str(np.mean(gq_zero))
  120.         m = str(np.mean(gq_one))
  121.         n = str(np.mean(gq_two))
  122.         o = str(np.mean(gq_three))
  123.         p = str(np.mean(gqd_zero))
  124.         q = str(np.mean(gqd_one))
  125.         r = str(np.mean(gqd_two))
  126.         s = str(np.mean(gqd_three))
  127.         my_variant.samples[sample]['MDP0'] = h
  128.         my_variant.samples[sample]['MDP1'] = i
  129.         my_variant.samples[sample]['MDP2'] = j
  130.         my_variant.samples[sample]['MDP3'] = k
  131.         my_variant.samples[sample]['MGQ0'] = l
  132.         my_variant.samples[sample]['MGQ1'] = m
  133.         my_variant.samples[sample]['MGQ2'] = n
  134.         my_variant.samples[sample]['MGQ3'] = o
  135.         my_variant.samples[sample]['MGQD0'] = p
  136.         my_variant.samples[sample]['MGQD1'] = q
  137.         my_variant.samples[sample]['MGQD2'] = r
  138.         my_variant.samples[sample]['MGQD3'] = s
  139.         out.write(str(my_variant))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement