Guest User

Untitled

a guest
Oct 16th, 2018
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.98 KB | None | 0 0
  1. #
  2. # Code to parse the interesting bits of a PAML results file that contains
  3. # statistics for several trees. Note, this approach contains some hacks that
  4. # are probably specific to multiple-tree CODEML files, Biopython has a module
  5. # for handling other PAML outputs, and it probably a better starting point for
  6. # 'normal' (single tree) input files.
  7. #
  8. # TODO - should clean up handlng of resifues - use @property decorator and
  9. # ._functions to simplify writing/reading.
  10. #
  11.  
  12.  
  13. import sys
  14. import re
  15. from collections import defaultdict
  16. from scipy.stats import chisqprob
  17.  
  18. class PAMLResult(object):
  19. """Represent the results from one PAMl model for one tree """
  20.  
  21. def __init__(self, lnL, residues =None):
  22. self.lnL = lnL
  23. self.residues = residues
  24.  
  25. def __repr__(self):
  26. return 'PAMLResult(lnL= {0})'.format(self.lnL)
  27.  
  28.  
  29. class PAMLTree(dict):
  30. """Hold PAML restuls for one tree across several models"""
  31.  
  32. def __init__(self, *args):
  33. dict.__init__(self, args)
  34.  
  35.  
  36. def _add_result(self, model, res):
  37. self[model] = res
  38.  
  39.  
  40. def _calculate_LRTs(self):
  41. """Run likelihood ratio test if there are enough results """
  42. if all( [m in self.keys() for m in [1,2]] ):
  43. D = -2 * self[1].lnL + 2 * self[2].lnL
  44. pval = chisqprob(D,2)
  45. self.LRT_m1m2 = (D, pval)
  46.  
  47. if all( [m in self.keys() for m in [7,8]] ):
  48. D = -2 * self[7].lnL + 2 * self[8].lnL
  49. pval = chisqprob(D,2)
  50. self.LRT_m7m8 = (D, pval)
  51.  
  52.  
  53. class PAMLParser(object):
  54. """Parse an entire CODEML result file
  55.  
  56. Collects data from all models and trees analysed and runs LRTs for model
  57. comparions as well as collating residues found to be under selection using
  58. Bayes Emperical Bayes.
  59.  
  60. Usage
  61.  
  62. result_file = PAMLParser("my_run.out")
  63. result_file.write("my_run", LRT=True, residues=False)
  64.  
  65. will write files:
  66. 'my_run_LRTs.csv', 'my_run_res_m2.csv' and 'my_run_resm8.csv'
  67. """
  68.  
  69. def __init__(self, fname):
  70. self.fname = fname
  71. self.read()
  72.  
  73.  
  74. def _get_residues(self, model):
  75. """ """
  76. d = defaultdict(list)
  77. for tree, mod in self.combined.items():
  78. for line in mod[model].residues:
  79. d[line[0]].append(line[2])
  80. return d
  81.  
  82.  
  83. def read(self):
  84. self.combined = defaultdict(PAMLTree)
  85. lnL_pattern = re.compile('np:\s+\d+\):\s+(-\d+\.\d+)')
  86.  
  87. mods = open(self.fname).read().split("Model ")[1:]
  88. for m in mods:
  89. m_name = int(m.split(':', 1)[0])
  90. trees = m.split('TREE ')[1:]
  91. for t in trees:
  92. t_name = int(t.split(':')[0][-1])
  93. ln_match = lnL_pattern.search(t)
  94. t_ln = float(ln_match.group(1))
  95. if "Bayes Empirical Bayes" in t:
  96. beb = t.split("Bayes Empirical Bayes (BEB) analysis")[1]
  97. table = beb.split("SE for w")[1].split('The grid')[0]
  98. #index, default, posterior w _ SE for residues
  99. residues = [tuple(l.split()) for l in table.split("\n") if l]
  100. else:
  101. residues = None
  102. res = PAMLResult(lnL=t_ln, residues=residues)
  103. self.combined[t_name]._add_result(m_name, res)
  104.  
  105. #all done, if there are any LRT-tests to run we can do them now
  106. [p._calculate_LRTs() for p in self.combined.values()]
  107.  
  108. def write(self, file_stem, LRT = True, residues= True):
  109. """ """
  110. if LRT:
  111. out = open(file_stem + "_LRTs.csv", "w")
  112. out.write("tree,comp,D, p-val\n")
  113. t = 0
  114. for tree, results in self.combined.items():
  115. try:
  116. D, pval = results.LRT_m1m2
  117. out.write("{0}, m1am2a, {1}, {2}\n".format(
  118. tree,round(D,3), pval))
  119. t += 1
  120. except(AttributeError):
  121. pass #no results for that comparisom
  122.  
  123. try:
  124. D, pval = results.LRT_m7m8
  125. out.write("{0}, m7m8, {1}, {2}\n".format(
  126. tree,round(D,3), pval))
  127. t += 1
  128. except(AttributeError):
  129. pass #no results for that comparisom
  130.  
  131. out.close()
  132. print "wrote data for {0} LRTs".format(t)
  133.  
  134. if residues:
  135.  
  136. try:
  137. m8 = self._get_residues(8)
  138. out = open(file_stem +"_res_m8.csv","w")
  139. header = "res, " + ",".join(
  140. ["t" + str(i) for i in range(1, len(m8.values()[1])+1)]
  141. )
  142. out.write(header + "\n")
  143. counter = 0
  144. for res, vals in m8.items():
  145. line = "{0},{1}\n".format(
  146. res, ",".join(f.strip("*") for f in vals))
  147. out.write(line)
  148. counter += 1
  149. print "wrote data for {0} residues using model 8".format(counter)
  150.  
  151. except KeyError:
  152. pass
  153.  
  154. try:
  155. m2 = self._get_residues(2)
  156. out = open(file_stem +"_res_m2.csv","w")
  157. header = "res, " + ",".join(
  158. ["t" + str(i) for i in range(1, len(m2.values()[1])+1)]
  159. )
  160. out.write(header + "\n")
  161. counter = 0
  162. for res, vals in m2.items():
  163. line = "{0},{1}\n".format(
  164. res, ",".join(f.strip("*") for f in vals))
  165. out.write(line)
  166. counter += 1
  167. print "wrote data for {0} residues using model 2a".format(counter)
  168.  
  169. except KeyError:
  170. pass #not finised/not results for this model
  171.  
  172.  
  173. def main():
  174. """Get the information form one file, given as a commandline argument """
  175. result = PAMLParser(sys.argv[1])
  176. result.write(sys.argv[1] + '_result')
  177.  
  178. if __name__ == "__main__":
  179. main()
Add Comment
Please, Sign In to add comment