Guest User

Untitled

a guest
Dec 14th, 2017
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.19 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import argparse
  4. from cyvcf2 import VCF, Writer
  5.  
  6.  
  7. def __main__():
  8. parser = argparse.ArgumentParser(description='Pulls depth and allelic support from sample columns')
  9. parser.add_argument('input_vcf', help='VCF query with sample calls')
  10. parser.add_argument('control_sample_id', help='Sample ID for the control sample, as found in the VCF')
  11. parser.add_argument('tumor_sample_id', help='Sample ID for the tumor sample, as found in the VCF')
  12. args = parser.parse_args()
  13.  
  14. pull_genotype_data(args.input_vcf, args.control_sample_id, args.tumor_sample_id)
  15.  
  16.  
  17. def pull_genotype_data(query_vcf, control_sample_id, tumor_sample_id):
  18. vcf = VCF(query_vcf, gts012=True)
  19.  
  20. vcf.add_info_to_header(
  21. {'ID': 'CVAF', 'Description': 'Allelic fraction of alternate allele in control sample', 'Type': 'Float',
  22. 'Number': '1'})
  23. vcf.add_info_to_header(
  24. {'ID': 'TVAF', 'Description': 'Allelic fraction of alternate allele in tumor sample', 'Type': 'Float',
  25. 'Number': '1'})
  26. vcf.add_info_to_header(
  27. {'ID': 'CDP', 'Description': 'Depth at variant site in control sample', 'Type': 'Integer', 'Number': '1'})
  28. vcf.add_info_to_header(
  29. {'ID': 'TDP', 'Description': 'Depth at variant site in tumor sample', 'Type': 'Integer', 'Number': '1'})
  30.  
  31. out_vcf = 'test.pcgr_prepped.vcf'
  32. w = Writer(out_vcf, vcf)
  33.  
  34.  
  35. ad_colms = set(["AD"]) #contains raw counts
  36. ad_freq_colms = set(["AF", "FREQ", "FA"]) #contains actual frequencies which can be used directly
  37.  
  38. dp_tag = 'DP'
  39.  
  40. try:
  41. control_index=vcf.samples.index(control_sample_id)
  42. tumor_index = vcf.samples.index(tumor_sample_id)
  43. except ValueError as err:
  44. print("Sample not found in vcf")
  45. raise
  46.  
  47. ## Go through each record
  48. for rec in vcf:
  49. allelic_support = {}
  50. for v in ['TDP', 'CDP', 'CVAF', 'TVAF']:
  51. allelic_support[v] = -1
  52. rec.INFO[v] = '.'
  53. control_alt_support = -1
  54. tumor_alt_support = -1
  55.  
  56. ad_tag=ad_freq_colms.intersection(rec.FORMAT)
  57. if(len(ad_tag)>0):
  58. ad_tag=ad_tag.pop()
  59. ad_tag_is_frequency=True
  60. else:
  61. ad_tag = ad_colms.intersection(rec.FORMAT).pop()
  62. ad_tag_is_frequency = False
  63.  
  64. sample_dat_ao = rec.format(ad_tag)
  65. if not sample_dat_ao is None:
  66. sample_dim = sample_dat_ao.shape[0]
  67. if sample_dim >= 2:
  68. control_alt_support = str(sample_dat_ao[control_index][0])
  69. if control_alt_support == '-2147483648':
  70. control_alt_support = -1
  71. tumor_alt_support = str(sample_dat_ao[tumor_index][0])
  72. if tumor_alt_support == '-2147483648':
  73. tumor_alt_support = -1
  74.  
  75.  
  76. sample_dat_dp = rec.format(dp_tag)
  77. if not sample_dat_dp is None:
  78. sample_dim = sample_dat_dp.shape[0]
  79. if sample_dim >= 2:
  80. allelic_support['CDP'] = str(sample_dat_dp[control_index][0])
  81. if allelic_support['CDP'] == '-2147483648':
  82. allelic_support['CDP'] = -1
  83. allelic_support['TDP'] = str(sample_dat_dp[tumor_index][0])
  84. if allelic_support['TDP'] == '-2147483648':
  85. allelic_support['TDP'] = -1
  86.  
  87.  
  88. if(ad_tag_is_frequency):
  89. allelic_support['TVAF'] = tumor_alt_support
  90. elif tumor_alt_support != -1 and allelic_support['TDP'] != -1:
  91. allelic_support['TVAF'] = "{0:.4f}".format(int(tumor_alt_support) / float(allelic_support['TDP']))
  92.  
  93. if (ad_tag_is_frequency):
  94. allelic_support['CVAF']= control_alt_support
  95. elif control_alt_support != -1 and allelic_support['CDP'] != -1:
  96. if(float(allelic_support['CDP'])==0): #prevent divide by zero errors
  97. allelic_support['CVAF'] = 0.0
  98. else:
  99. allelic_support['CVAF'] = "{0:.4f}".format(int(control_alt_support) / float(allelic_support['CDP']))
  100.  
  101. for v in ['TDP', 'CDP', 'CVAF', 'TVAF']:
  102. if allelic_support[v] != -1:
  103. rec.INFO[v] = allelic_support[v]
  104.  
  105. w.write_record(rec)
  106.  
  107.  
  108. w.close()
  109. vcf.close()
  110.  
  111. return
  112.  
  113.  
  114. if __name__ == "__main__": __main__()
Add Comment
Please, Sign In to add comment