Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import argparse
- import khmer
- from math import log
- import re
- import scipy.stats
- import sys
- def load_smallcounttable(filename):
- print('Loading counttable "{:s}"...'.format(filename), end='',
- file=sys.stderr, flush=True)
- ct = khmer.SmallCounttable.load(filename)
- print('done!', file=sys.stderr, flush=True)
- return ct
- def load_counttable(filename):
- print('Loading counttable "{:s}"...'.format(filename), end='',
- file=sys.stderr, flush=True)
- ct = khmer.Counttable.load(filename)
- print('done!', file=sys.stderr, flush=True)
- return ct
- def load_nodetable(filename):
- print('Loading nodetable "{:s}"...'.format(filename), end='',
- file=sys.stderr, flush=True)
- ct = khmer.Nodetable.load(filename)
- print('done!', file=sys.stderr, flush=True)
- return ct
- def filter_refr(akmers, rkmers, refr):
- newa, newr = list(), list()
- for a in akmers:
- if refr.get(a) == 0:
- newa.append(a)
- for r in rkmers:
- if refr.get(r) < 2:
- newr.append(r)
- return newa, newr
- def get_abundances(kmers, casecounts, controlcounts):
- abundances = list()
- for _ in range(len(controlcounts) + 1):
- abundances.append(list())
- for kmer in kmers:
- a = casecounts.get(kmer)
- abundances[0].append(a)
- for i in range(len(controlcounts)):
- a = controlcounts[i].get(kmer)
- abundances[i+1].append(a)
- return abundances
- def set_error_rates(error, nsamples):
- if isinstance(error, float):
- errors = [error] * nsamples
- elif isinstance(error, list):
- assert len(error) == nsamples
- for e in error:
- assert isinstance(e, float)
- errors = error
- else:
- message = 'variable {} doesn\'t quack like a float'.format(error)
- message += ' or a list of floats'
- raise ValueError(message)
- return errors
- def abund_log_prob(genotype, abundance, mean=30.0, sd=8.0, error=0.01):
- if genotype == 0:
- return abundance * log(error)
- if genotype == 1:
- return scipy.stats.norm.logpdf(abundance, mean / 2, sd / 2)
- if genotype == 2:
- return scipy.stats.norm.logpdf(abundance, mean, sd)
- def likelihood_denovo(altabunds, refrabunds, mean=30.0, sd=8.0, error=0.01):
- errors = set_error_rates(error, nsamples=len(altabunds))
- logsum = 0.0
- # Case
- for abund in altabunds[0] + refrabunds[0]:
- logsum += abund_log_prob(1, abund, mean=mean, sd=sd)
- # Controls
- for alt, refr, err in zip(altabunds[1:], refrabunds[1:], errors[1:]):
- for a in alt:
- logsum += abund_log_prob(0, a, error=err)
- for r in refr:
- logsum += abund_log_prob(2, r, mean=mean, sd=sd)
- return logsum
- def likelihood_false(altabunds, error=0.01):
- errors = set_error_rates(error, nsamples=len(altabunds))
- logsum = 0.0
- for abundlist, e in zip(altabunds, errors):
- for abund in abundlist:
- logsum += abund_log_prob(0, abund, error=e)
- return logsum
- def likelihood_inherited(altabunds, mean=30.0, sd=8.0, error=0.01):
- scenarios = [
- (1, 0, 1), (1, 0, 2),
- (1, 1, 0), (1, 1, 1), (1, 1, 2),
- (1, 2, 0), (1, 2, 1),
- (2, 1, 1), (2, 1, 2),
- (2, 2, 1), (2, 2, 2),
- ]
- errors = set_error_rates(error, nsamples=3)
- logsum = 0.0
- abundances = zip(altabunds[0], altabunds[1], altabunds[2])
- for a_c, a_m, a_f in abundances:
- maxval = None
- for g_c, g_m, g_f in scenarios:
- testsum = abund_log_prob(g_c, a_c, mean, sd, errors[0]) + \
- abund_log_prob(g_m, a_m, mean, sd, errors[1]) + \
- abund_log_prob(g_f, a_f, mean, sd, errors[2]) + \
- log(1.0 / 15.0)
- if maxval is None or testsum > maxval:
- maxval = testsum
- logsum += maxval
- return log(15.0 / 11.0) * logsum # 1 / (11/15)
- def joinlist(thelist):
- if len(thelist) == 0:
- return '.'
- else:
- return ','.join([str(v) for v in thelist])
- parser = argparse.ArgumentParser()
- parser.add_argument('--case', type=load_counttable)
- parser.add_argument('--controls', type=load_counttable, nargs='+')
- parser.add_argument('--mu', type=float, default=40.0)
- parser.add_argument('--sigma', type=float, default=8.0)
- parser.add_argument('--epsilon', type=float, default=0.01)
- parser.add_argument('--out', type=argparse.FileType('w'), default=sys.stdout)
- parser.add_argument('--refr', type=load_smallcounttable)
- parser.add_argument('vcf', type=argparse.FileType('r'))
- args = parser.parse_args()
- for record in args.vcf:
- fields = record.strip().split('\t')
- windowmatch = re.search('VW=([^;\n]+)', record)
- if not windowmatch:
- continue
- window = windowmatch.group(1)
- if len(window) < args.case.ksize():
- message = 'WARNING: ignoring window "{:s}"'.format(window)
- message += ', shorter than ksize {:d}'.format(args.case.ksize())
- print(message, file=sys.stderr)
- continue
- windowmatch = re.search('RW=([^;\n]+)', record)
- if not windowmatch:
- continue
- refrwindow = windowmatch.group(1)
- altkmers = args.case.get_kmers(window)
- refrkmers = args.case.get_kmers(refrwindow)
- dropped = 0
- if args.refr:
- nalt = len(altkmers)
- nrefr = len(refrkmers)
- altkmers, refrkmers = filter_refr(altkmers, refrkmers, args.refr)
- adropped = nalt - len(altkmers)
- rdropped = nrefr - len(refrkmers)
- if len(altkmers) == 0:
- message = 'WARNING: ignoring window "{:s}"'.format(window)
- message += ', all k-mers present in reference genome'
- print(message, file=sys.stderr)
- continue
- altabunds = get_abundances(altkmers, args.case, args.controls)
- refrabunds = get_abundances(refrkmers, args.case, args.controls)
- aacase = joinlist(altabunds[0])
- racase = joinlist(refrabunds[0])
- aactrl1 = joinlist(altabunds[1])
- ractrl1 = joinlist(refrabunds[1])
- aactrl2 = joinlist(altabunds[2])
- ractrl2 = joinlist(refrabunds[2])
- likedn = likelihood_denovo(altabunds, refrabunds, mean=args.mu,
- sd=args.sigma, error=args.epsilon)
- likefp = likelihood_false(altabunds, error=args.epsilon)
- likeih = likelihood_inherited(altabunds, mean=args.mu, sd=args.sigma,
- error=args.epsilon)
- fields[7] += ';LLDN={:.2f};LLFP={:.2f};LLIH={:.2f}'.format(likedn, likefp,
- likeih)
- fields[7] += ';DROPPED={:d},{:d}'.format(adropped, rdropped)
- fields[7] += ';ALTCASE={:s};REFRCASE={:s}'.format(aacase, racase)
- fields[7] += ';ALTCTRL1={:s};REFRCTRL1={:s}'.format(aactrl1, ractrl1)
- fields[7] += ';ALTCTRL2={:s};REFRCTRL2={:s}'.format(aactrl2, ractrl2)
- print(*fields, sep='\t', file=args.out)
Add Comment
Please, Sign In to add comment