Guest User

Untitled

a guest
Jan 17th, 2018
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.77 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import argparse
  4. import khmer
  5. from math import log
  6. import re
  7. import scipy.stats
  8. import sys
  9.  
  10.  
  11. def load_smallcounttable(filename):
  12. print('Loading counttable "{:s}"...'.format(filename), end='',
  13. file=sys.stderr, flush=True)
  14. ct = khmer.SmallCounttable.load(filename)
  15. print('done!', file=sys.stderr, flush=True)
  16. return ct
  17.  
  18.  
  19. def load_counttable(filename):
  20. print('Loading counttable "{:s}"...'.format(filename), end='',
  21. file=sys.stderr, flush=True)
  22. ct = khmer.Counttable.load(filename)
  23. print('done!', file=sys.stderr, flush=True)
  24. return ct
  25.  
  26.  
  27. def load_nodetable(filename):
  28. print('Loading nodetable "{:s}"...'.format(filename), end='',
  29. file=sys.stderr, flush=True)
  30. ct = khmer.Nodetable.load(filename)
  31. print('done!', file=sys.stderr, flush=True)
  32. return ct
  33.  
  34.  
  35. def filter_refr(akmers, rkmers, refr):
  36. newa, newr = list(), list()
  37. for a in akmers:
  38. if refr.get(a) == 0:
  39. newa.append(a)
  40. for r in rkmers:
  41. if refr.get(r) < 2:
  42. newr.append(r)
  43. return newa, newr
  44.  
  45.  
  46. def get_abundances(kmers, casecounts, controlcounts):
  47. abundances = list()
  48. for _ in range(len(controlcounts) + 1):
  49. abundances.append(list())
  50.  
  51. for kmer in kmers:
  52. a = casecounts.get(kmer)
  53. abundances[0].append(a)
  54. for i in range(len(controlcounts)):
  55. a = controlcounts[i].get(kmer)
  56. abundances[i+1].append(a)
  57.  
  58. return abundances
  59.  
  60.  
  61. def set_error_rates(error, nsamples):
  62. if isinstance(error, float):
  63. errors = [error] * nsamples
  64. elif isinstance(error, list):
  65. assert len(error) == nsamples
  66. for e in error:
  67. assert isinstance(e, float)
  68. errors = error
  69. else:
  70. message = 'variable {} doesn\'t quack like a float'.format(error)
  71. message += ' or a list of floats'
  72. raise ValueError(message)
  73. return errors
  74.  
  75.  
  76. def abund_log_prob(genotype, abundance, mean=30.0, sd=8.0, error=0.01):
  77. if genotype == 0:
  78. return abundance * log(error)
  79. if genotype == 1:
  80. return scipy.stats.norm.logpdf(abundance, mean / 2, sd / 2)
  81. if genotype == 2:
  82. return scipy.stats.norm.logpdf(abundance, mean, sd)
  83.  
  84.  
  85. def likelihood_denovo(altabunds, refrabunds, mean=30.0, sd=8.0, error=0.01):
  86. errors = set_error_rates(error, nsamples=len(altabunds))
  87. logsum = 0.0
  88.  
  89. # Case
  90. for abund in altabunds[0] + refrabunds[0]:
  91. logsum += abund_log_prob(1, abund, mean=mean, sd=sd)
  92.  
  93. # Controls
  94. for alt, refr, err in zip(altabunds[1:], refrabunds[1:], errors[1:]):
  95. for a in alt:
  96. logsum += abund_log_prob(0, a, error=err)
  97. for r in refr:
  98. logsum += abund_log_prob(2, r, mean=mean, sd=sd)
  99. return logsum
  100.  
  101.  
  102. def likelihood_false(altabunds, error=0.01):
  103. errors = set_error_rates(error, nsamples=len(altabunds))
  104. logsum = 0.0
  105. for abundlist, e in zip(altabunds, errors):
  106. for abund in abundlist:
  107. logsum += abund_log_prob(0, abund, error=e)
  108. return logsum
  109.  
  110.  
  111. def likelihood_inherited(altabunds, mean=30.0, sd=8.0, error=0.01):
  112. scenarios = [
  113. (1, 0, 1), (1, 0, 2),
  114. (1, 1, 0), (1, 1, 1), (1, 1, 2),
  115. (1, 2, 0), (1, 2, 1),
  116. (2, 1, 1), (2, 1, 2),
  117. (2, 2, 1), (2, 2, 2),
  118. ]
  119. errors = set_error_rates(error, nsamples=3)
  120. logsum = 0.0
  121. abundances = zip(altabunds[0], altabunds[1], altabunds[2])
  122. for a_c, a_m, a_f in abundances:
  123. maxval = None
  124. for g_c, g_m, g_f in scenarios:
  125. testsum = abund_log_prob(g_c, a_c, mean, sd, errors[0]) + \
  126. abund_log_prob(g_m, a_m, mean, sd, errors[1]) + \
  127. abund_log_prob(g_f, a_f, mean, sd, errors[2]) + \
  128. log(1.0 / 15.0)
  129. if maxval is None or testsum > maxval:
  130. maxval = testsum
  131. logsum += maxval
  132. return log(15.0 / 11.0) * logsum # 1 / (11/15)
  133.  
  134.  
  135. def joinlist(thelist):
  136. if len(thelist) == 0:
  137. return '.'
  138. else:
  139. return ','.join([str(v) for v in thelist])
  140.  
  141.  
  142. parser = argparse.ArgumentParser()
  143. parser.add_argument('--case', type=load_counttable)
  144. parser.add_argument('--controls', type=load_counttable, nargs='+')
  145. parser.add_argument('--mu', type=float, default=40.0)
  146. parser.add_argument('--sigma', type=float, default=8.0)
  147. parser.add_argument('--epsilon', type=float, default=0.01)
  148. parser.add_argument('--out', type=argparse.FileType('w'), default=sys.stdout)
  149. parser.add_argument('--refr', type=load_smallcounttable)
  150. parser.add_argument('vcf', type=argparse.FileType('r'))
  151. args = parser.parse_args()
  152.  
  153.  
  154. for record in args.vcf:
  155. fields = record.strip().split('\t')
  156. windowmatch = re.search('VW=([^;\n]+)', record)
  157. if not windowmatch:
  158. continue
  159. window = windowmatch.group(1)
  160. if len(window) < args.case.ksize():
  161. message = 'WARNING: ignoring window "{:s}"'.format(window)
  162. message += ', shorter than ksize {:d}'.format(args.case.ksize())
  163. print(message, file=sys.stderr)
  164. continue
  165. windowmatch = re.search('RW=([^;\n]+)', record)
  166. if not windowmatch:
  167. continue
  168. refrwindow = windowmatch.group(1)
  169.  
  170. altkmers = args.case.get_kmers(window)
  171. refrkmers = args.case.get_kmers(refrwindow)
  172. dropped = 0
  173. if args.refr:
  174. nalt = len(altkmers)
  175. nrefr = len(refrkmers)
  176. altkmers, refrkmers = filter_refr(altkmers, refrkmers, args.refr)
  177. adropped = nalt - len(altkmers)
  178. rdropped = nrefr - len(refrkmers)
  179. if len(altkmers) == 0:
  180. message = 'WARNING: ignoring window "{:s}"'.format(window)
  181. message += ', all k-mers present in reference genome'
  182. print(message, file=sys.stderr)
  183. continue
  184.  
  185. altabunds = get_abundances(altkmers, args.case, args.controls)
  186. refrabunds = get_abundances(refrkmers, args.case, args.controls)
  187. aacase = joinlist(altabunds[0])
  188. racase = joinlist(refrabunds[0])
  189. aactrl1 = joinlist(altabunds[1])
  190. ractrl1 = joinlist(refrabunds[1])
  191. aactrl2 = joinlist(altabunds[2])
  192. ractrl2 = joinlist(refrabunds[2])
  193.  
  194. likedn = likelihood_denovo(altabunds, refrabunds, mean=args.mu,
  195. sd=args.sigma, error=args.epsilon)
  196. likefp = likelihood_false(altabunds, error=args.epsilon)
  197. likeih = likelihood_inherited(altabunds, mean=args.mu, sd=args.sigma,
  198. error=args.epsilon)
  199. fields[7] += ';LLDN={:.2f};LLFP={:.2f};LLIH={:.2f}'.format(likedn, likefp,
  200. likeih)
  201. fields[7] += ';DROPPED={:d},{:d}'.format(adropped, rdropped)
  202. fields[7] += ';ALTCASE={:s};REFRCASE={:s}'.format(aacase, racase)
  203. fields[7] += ';ALTCTRL1={:s};REFRCTRL1={:s}'.format(aactrl1, ractrl1)
  204. fields[7] += ';ALTCTRL2={:s};REFRCTRL2={:s}'.format(aactrl2, ractrl2)
  205. print(*fields, sep='\t', file=args.out)
Add Comment
Please, Sign In to add comment