Advertisement
Guest User

Untitled

a guest
Oct 23rd, 2019
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.98 KB | None | 0 0
  1. import sys
  2. import os
  3. import re
  4. import argparse as ap
  5.  
  6. class Container:
  7. def __init__(self):
  8. self.members = []
  9.  
  10. def add_member(self, line):
  11. self.members.append(line)
  12.  
  13. def get_last(self):
  14. return self.members[-1]
  15.  
  16. def dump(self):
  17. rep = [m.dump() if isinstance(m, Container) else m for m in self.members]
  18. return "\n".join(rep)
  19.  
  20.  
  21. class MSScan(Container):
  22. def __init__(self):
  23. super(MSScan, self).__init__()
  24. self.scan_num = None
  25. self.filter_line = None
  26.  
  27. def finalize(self):
  28. self.scan_num = int(re.search('(?<=num\=\")[0-9]+', self.members[0]).group(0))
  29. self.filter_line = re.search('(?<=filterLine\=\").+(?=\")', self.members[5]).group(0)
  30.  
  31.  
  32. class MSRun(Container):
  33. def __init__(self):
  34. super(MSRun, self).__init__()
  35. self.scan_count = 0
  36. self.start_time = 0
  37. self.end_time = 0
  38.  
  39. def count_scans(self, match):
  40. self.scan_count += 1
  41. return str(self.scan_count)
  42.  
  43. def sort(self):
  44. key_func = lambda m: (m.filter_line, m.scan_num) if isinstance(m, MSScan) else ()
  45. tail = self.members[-1:]
  46. self.members = sorted(self.members[:-1], key=key_func)
  47. self.members += tail
  48.  
  49. def dump(self):
  50. rep = super(MSRun, self).dump()
  51. rep = re.sub('(?<=scan num\=\")[0-9]+', self.count_scans, rep)
  52. rep = re.sub('(?<=scanCount\=\")[0-9]+', str(self.scan_count), rep)
  53. return rep
  54.  
  55. class MSFile(Container):
  56. def __init__(self):
  57. super(MSFile, self).__init__()
  58.  
  59. def make_index(self):
  60. scan_search = re.compile("\<scan")
  61. self.members.append(' <index name="scan" >')
  62. for ind, match in enumerate(scan_search.finditer(self.members[0])):
  63. to_add = ' <offset id="{index}" >{off}</offset>'.format(index=ind+1, off=match.start(0))
  64. self.members.append(to_add)
  65. self.members.append(' </index>')
  66. self.members.append(' <indexOffset>{}</indexOffset>'.format(len(self.members[0]) + 2)) #+2 for newline and space
  67.  
  68. def dump(self):
  69. self.members = [super(MSFile, self).dump()]
  70. self.make_index()
  71. self.members.append("</mzXML>\n")
  72. return '\n'.join(self.members)
  73.  
  74. class FileParser:
  75. def __init__(self, split_criteria, transfer_ms2, sort_ms1, output_right):
  76. self.split_criteria = re.compile(split_criteria)
  77. self.transfer_ms2 = transfer_ms2
  78. self.sort_ms1 = sort_ms1
  79. self.output_right = output_right
  80. self.left_file = MSFile()
  81. self.right_file = MSFile()
  82. self.buff = None
  83. self.mz_file = None
  84.  
  85. # Search helpers
  86. self.isRunStart = re.compile('\<msRun')
  87. self.isRunEnd = re.compile('\<\/msRun\>')
  88.  
  89. self.isScanStart = re.compile('\<scan')
  90. self.isScanEnd = re.compile('\<\/scan\>')
  91.  
  92. self.isIndexStart = re.compile('\<index')
  93. self.isIndexEnd = re.compile('\<\/indexOffset\>')
  94.  
  95.  
  96. def parse_scan(self):
  97. scan = MSScan()
  98. scan.add_member(self.buff)
  99. self.buff = self.mz_file.pop()
  100. while self.mz_file:
  101. if self.isScanStart.search(self.buff):
  102. self.buff = self.parse_scan()
  103. scan.add_member(self.buff)
  104. elif self.isScanEnd.search(self.buff):
  105. scan.add_member(self.buff)
  106. scan.finalize()
  107. break
  108. else:
  109. scan.add_member(self.buff)
  110. self.buff = self.mz_file.pop()
  111. return scan
  112.  
  113.  
  114. def parse_ms_run(self):
  115. # This is where criteria comes in.
  116. # Right now, I think the best idea is
  117. # to build to run containers and then
  118. # append them at the end.
  119. left_run = MSRun()
  120. right_run = MSRun()
  121. while self.mz_file:
  122. if self.isScanStart.search(self.buff):
  123. self.buff = self.parse_scan()
  124. # if scan meets criteria, send to left scan
  125. # else, take all ms2 from scan and give it
  126. # to the last scan in left
  127. if self.split_criteria.search(self.buff.filter_line):
  128. left_run.add_member(self.buff)
  129. else:
  130. if self.transfer_ms2:
  131. [left_run.get_last().add_member(m) if isinstance(m, MSScan) else None
  132. for m in self.buff.members]
  133. right_run.add_member(self.buff)
  134.  
  135. elif self.isRunEnd.search(self.buff):
  136. left_run.add_member(self.buff)
  137. right_run.add_member(self.buff)
  138. break
  139. else:
  140. left_run.add_member(self.buff)
  141. right_run.add_member(self.buff)
  142. self.buff = self.mz_file.pop()
  143.  
  144. if self.sort_ms1:
  145. left_run.sort()
  146. right_run.sort()
  147.  
  148. self.left_file.add_member(left_run)
  149. self.right_file.add_member(right_run)
  150.  
  151. def make_index(self):
  152. while self.mz_file:
  153. if self.isIndexEnd.search(self.buff):
  154. break
  155. self.buff = self.mz_file.pop()
  156.  
  157. def parse_file(self, file_name):
  158. with open(file_name, mode='rb') as source:
  159. self.mz_file = source.read().decode('ISO-8859-1').split('\n')
  160. self.mz_file.reverse()
  161.  
  162. self.buff = self.mz_file.pop()
  163. while self.mz_file:
  164. if self.isRunStart.search(self.buff):
  165. self.parse_ms_run()
  166. break # Ignore everything after </msRun>
  167. else:
  168. self.left_file.add_member(self.buff)
  169. self.right_file.add_member(self.buff)
  170. self.buff = self.mz_file.pop()
  171.  
  172. def write_files(self, path, left_name, right_name):
  173.  
  174. with open(os.path.join(path, left_name), mode='wb') as dest:
  175. dest.write(self.left_file.dump().encode('ISO-8859-1'))
  176.  
  177. if self.output_right:
  178. with open(os.path.join(path, right_name), mode='wb') as dest:
  179. dest.write(self.right_file.dump().encode('ISO-8859-1'))
  180.  
  181. if __name__ == "__main__":
  182. arg_parser = ap.ArgumentParser(prog="mzXMLparser",
  183. usage="""
  184. This program is meant for the task of spliting mzXML files based on the MS1 filter line into 2 child files,
  185. named left and right. These prefixes can be changed ith the -p1 or -p2 command. The script can also sort MS1s.
  186.  
  187. python mzxml_splitter.py [args] file1 file2 ...
  188. """
  189. )
  190. arg_parser.add_argument("-t", "--transfer", action="store_true",
  191. help="Whether to transfer ms2 from non-matching MS1 into matching MS1")
  192. arg_parser.add_argument("-s", "--sort", action="store_true",
  193. help="Whether to sort the MS1 scans once they are split (sorts both files). Sorting is on filterLine first, then original scan number.")
  194. arg_parser.add_argument("-or", "--output_right", action="store_true",
  195. help="Whether to output a mzXML file with MS1s that don't match your REGEX")
  196. arg_parser.add_argument("-r", "--regex", default='',
  197. help="REGEX to split MS1 scans on. Searches the filterLine attribute of MS1 scans.")
  198. arg_parser.add_argument("-p1", "--prefix1", default='left',
  199. help="Prefix to add to file of scans which MATCH specified criteria")
  200. arg_parser.add_argument("-p2", "--prefix2", default='right',
  201. help="Prefix to add to file of scans which DON'T MATCH specified criteria")
  202. arg_parser.add_argument("-d", "--directory", default='./',
  203. help="Output directory for final files")
  204. arg_parser.add_argument('files', nargs='+')
  205. args = arg_parser.parse_args()
  206. for f in args.files:
  207. file_parser = FileParser(args.regex, args.transfer, args.sort, args.output_right)
  208. file_parser.parse_file(f)
  209. base_name = os.path.split(f)[1]
  210. file_parser.write_files(args.directory,
  211. args.prefix1 + "_" + base_name,
  212. args.prefix2 + "_" + base_name)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement