Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import argparse
- from cyvcf2 import VCF, Writer
- def __main__():
- parser = argparse.ArgumentParser(description='Pulls depth and allelic support from sample columns')
- parser.add_argument('input_vcf', help='VCF query with sample calls')
- parser.add_argument('control_sample_id', help='Sample ID for the control sample, as found in the VCF')
- parser.add_argument('tumor_sample_id', help='Sample ID for the tumor sample, as found in the VCF')
- args = parser.parse_args()
- pull_genotype_data(args.input_vcf, args.control_sample_id, args.tumor_sample_id)
- def pull_genotype_data(query_vcf, control_sample_id, tumor_sample_id):
- vcf = VCF(query_vcf, gts012=True)
- vcf.add_info_to_header(
- {'ID': 'CVAF', 'Description': 'Allelic fraction of alternate allele in control sample', 'Type': 'Float',
- 'Number': '1'})
- vcf.add_info_to_header(
- {'ID': 'TVAF', 'Description': 'Allelic fraction of alternate allele in tumor sample', 'Type': 'Float',
- 'Number': '1'})
- vcf.add_info_to_header(
- {'ID': 'CDP', 'Description': 'Depth at variant site in control sample', 'Type': 'Integer', 'Number': '1'})
- vcf.add_info_to_header(
- {'ID': 'TDP', 'Description': 'Depth at variant site in tumor sample', 'Type': 'Integer', 'Number': '1'})
- out_vcf = 'test.pcgr_prepped.vcf'
- w = Writer(out_vcf, vcf)
- ad_colms = set(["AD"]) #contains raw counts
- ad_freq_colms = set(["AF", "FREQ", "FA"]) #contains actual frequencies which can be used directly
- dp_tag = 'DP'
- try:
- control_index=vcf.samples.index(control_sample_id)
- tumor_index = vcf.samples.index(tumor_sample_id)
- except ValueError as err:
- print("Sample not found in vcf")
- raise
- ## Go through each record
- for rec in vcf:
- allelic_support = {}
- for v in ['TDP', 'CDP', 'CVAF', 'TVAF']:
- allelic_support[v] = -1
- rec.INFO[v] = '.'
- control_alt_support = -1
- tumor_alt_support = -1
- ad_tag=ad_freq_colms.intersection(rec.FORMAT)
- if(len(ad_tag)>0):
- ad_tag=ad_tag.pop()
- ad_tag_is_frequency=True
- else:
- ad_tag = ad_colms.intersection(rec.FORMAT).pop()
- ad_tag_is_frequency = False
- sample_dat_ao = rec.format(ad_tag)
- if not sample_dat_ao is None:
- sample_dim = sample_dat_ao.shape[0]
- if sample_dim >= 2:
- control_alt_support = str(sample_dat_ao[control_index][0])
- if control_alt_support == '-2147483648':
- control_alt_support = -1
- tumor_alt_support = str(sample_dat_ao[tumor_index][0])
- if tumor_alt_support == '-2147483648':
- tumor_alt_support = -1
- sample_dat_dp = rec.format(dp_tag)
- if not sample_dat_dp is None:
- sample_dim = sample_dat_dp.shape[0]
- if sample_dim >= 2:
- allelic_support['CDP'] = str(sample_dat_dp[control_index][0])
- if allelic_support['CDP'] == '-2147483648':
- allelic_support['CDP'] = -1
- allelic_support['TDP'] = str(sample_dat_dp[tumor_index][0])
- if allelic_support['TDP'] == '-2147483648':
- allelic_support['TDP'] = -1
- if(ad_tag_is_frequency):
- allelic_support['TVAF'] = tumor_alt_support
- elif tumor_alt_support != -1 and allelic_support['TDP'] != -1:
- allelic_support['TVAF'] = "{0:.4f}".format(int(tumor_alt_support) / float(allelic_support['TDP']))
- if (ad_tag_is_frequency):
- allelic_support['CVAF']= control_alt_support
- elif control_alt_support != -1 and allelic_support['CDP'] != -1:
- if(float(allelic_support['CDP'])==0): #prevent divide by zero errors
- allelic_support['CVAF'] = 0.0
- else:
- allelic_support['CVAF'] = "{0:.4f}".format(int(control_alt_support) / float(allelic_support['CDP']))
- for v in ['TDP', 'CDP', 'CVAF', 'TVAF']:
- if allelic_support[v] != -1:
- rec.INFO[v] = allelic_support[v]
- w.write_record(rec)
- w.close()
- vcf.close()
- return
- if __name__ == "__main__": __main__()
Add Comment
Please, Sign In to add comment