Guest User

Untitled

a guest
Jun 21st, 2018
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.99 KB | None | 0 0
  1. from __future__ import division
  2. from Bio import SeqIO
  3. import pprint
  4. import glob
  5. import os
  6. import sys
  7. import itertools
  8.  
  9. class Contigs():
  10. def __init__(self, contig_handle):
  11. self.contig_handle = contig_handle
  12. self.colibs = {}
  13. self.read_in_contigs()
  14. self.calc_contig_range()
  15.  
  16. def read_in_contigs(self):
  17. self.contig_seq = {}
  18. self.contig_order = []
  19. for contig in SeqIO.parse(self.contig_handle, 'fasta'):
  20. self.contig_order.append(contig.id)
  21. self.contig_seq[contig.id] = str(contig.seq)
  22.  
  23. def calc_contig_range(self):
  24. i = 0
  25. self.contig_range = {}
  26. for contig in self.contig_order:
  27. self.contig_range[contig] = [i, i+len(self.contig_seq[contig])]
  28. z = i+len(self.contig_seq[contig])
  29. i += len(self.contig_seq[contig])
  30. total_assembly_length = sum([len(self.contig_seq[x]) for x in self.contig_seq])
  31. assert total_assembly_length == z
  32.  
  33.  
  34. class CoLiB():
  35. """docstring for CoLiB"""
  36. def __init__(self, match_1, match_2, colib_id):
  37. self.colib_id = colib_id
  38. self.match_1_cons_pos = [match_1[1], match_1[2]]
  39. self.match_2_cons_pos = [match_2[1], match_2[2]]
  40. self.match_1_strand = match_1[4]
  41. self.match_2_strand = match_2[4]
  42. self.match_1_name = None
  43. self.match_2_name = None
  44. ## set these to false for all the singleton contig/colibs which dont get checked for upstream/downstream
  45. self.match_1_upstream = False
  46. self.match_2_upstream = False
  47. self.match_1_downstream = False
  48. self.match_2_downstream = False
  49. # self.match_1_chromo_pos = []
  50. # self.match_2_chromo_pos = []
  51.  
  52. def calc_intersection(self, list1, list2):
  53. ## do it this way because looking at intersection of ranges takes ages.
  54. ## it returns a negative value if there is no intersection
  55. d1 = 0
  56. d2 = 0
  57. if max(list2) - max(list1) < 0:
  58. d1 = abs(max(list2) - max(list1))
  59. if min(list2) - min(list1) < 0:
  60. d2 = abs(min(list2) - min(list1))
  61. return (max(list1) - min(list2)) - (d1 + d2)
  62.  
  63. def assign_to_contig(self, contigs_1, contigs_2):
  64. for contig in contigs_1.contig_range:
  65. ## calculate the intersection between each contig and each colib
  66. inter = self.calc_intersection(self.match_1_cons_pos, contigs_1.contig_range[contig])
  67. ## if the intersection accounts for either 2% of the contig length or 1000bp, set the colib match name as the contig
  68. ## also, add the colib to the contigs.colib dictionary which is like
  69. ## this {'contig4':[colib1, colib2]}
  70. if inter / (contigs_1.contig_range[contig][1] - contigs_1.contig_range[contig][0]) > 0.02 or inter > 1000:
  71. #print contig, inter, (contigs_1.contig_range[contig][1] - contigs_1.contig_range[contig][0])
  72. self.match_1_name = contig
  73. if contig in contigs_1.colibs:
  74. contigs_1.colibs[contig].append(self)
  75. else:
  76. contigs_1.colibs[contig] = []
  77. contigs_1.colibs[contig].append(self)
  78. for contig in contigs_2.contig_range:
  79. inter = self.calc_intersection(self.match_2_cons_pos, contigs_2.contig_range[contig])
  80. if inter / (contigs_2.contig_range[contig][1] - contigs_2.contig_range[contig][0]) > 0.02 or inter > 1000:
  81. #print contig, inter, (contigs_2.contig_range[contig][1] - contigs_2.contig_range[contig][0])
  82. self.match_2_name = contig
  83. if contig in contigs_2.colibs:
  84. contigs_2.colibs[contig].append(self)
  85. else:
  86. contigs_2.colibs[contig] = []
  87. contigs_2.colibs[contig].append(self)
  88.  
  89. def usage():
  90. print '''Usage is:
  91.  
  92. python infer_rearrangements.py assembly_1.fa assembly_2.fa /path/to/output_dir
  93.  
  94. Assemblies should end .fa'''
  95. sys.exit()
  96.  
  97. def get_args():
  98. if len(sys.argv) != 4:
  99. usage()
  100. else:
  101. return sys.argv[1], sys.argv[2], sys.argv[3]
  102.  
  103.  
  104.  
  105. def read_in_xmfa(xmfa_handle):
  106. all_results = []
  107. with open(xmfa_handle) as fi:
  108. lines = fi.readlines()
  109. lines = [x.strip() for x in lines]
  110. lines = [x for x in lines if x.startswith('> ')]
  111. lines = [x.lstrip('> ') for x in lines]
  112. while len(lines) > 1:
  113. first = lines.pop(0).replace(':', ' ').split(' ')
  114. first = [int(first[0]), int(first[1].split('-')[0]), int(first[1].split('-')[1]), first[-1], first[2]]
  115. # print first
  116. second = lines.pop(0).replace(':', ' ').split(' ')
  117. second = [int(second[0]), int(second[1].split('-')[0]), int(second[1].split('-')[1]), second[-1], second[2]]
  118. all_results.append([first, second])
  119. paired_colibs = []
  120. ## colib is colinear block
  121. i = 1
  122. for colib in all_results:
  123. if colib[0][0] == 1 and colib[1][0] == 2:
  124. # print colib
  125. clb = CoLiB(colib[0], colib[1], i)
  126. i += 1
  127. paired_colibs.append(clb)
  128. return paired_colibs
  129.  
  130. def get_up_downstream_colib(contigs_1, contigs_2, contig_pair):
  131. '''
  132. 0. for every colib in contigs_1.colibs[contig]
  133. 1. is colib.match1 the other co-lib up or downstream of it?
  134. 2.
  135.  
  136. for colib in contigs1.colibs[contig]:
  137. what is upstream and downstream of the colib?
  138. what is upstream and downstream of the same colib on contigs_2.colibs[colib.match_2_name]?
  139. taking strand differences into account, are there different colibs either upstream or downstream of the original colib in the two strains?
  140. '''
  141. ## i think this way of doing the match 1 by iterating through contigs 1 and
  142. ## match 2 by iterating through contigs 2 works ok
  143. ## default of all upstream/downstream is false, otherwith all the up/downstream for contigs with only one clb is None and the logic breaks
  144. for i, colib in enumerate(contigs_1.colibs[contig_pair[0]]):
  145. ## if its the first one
  146. if i == 0:
  147. ## then upstream must be false (default)
  148. ## downstream is the colib one downstream of this one
  149. ## contig_pair[0] is the contig this strain is on
  150. ## contigs_1.colibs[contig_pair[0]] is the list of colibs for this contig
  151. ## i + 1 is the one downstream of this
  152. colib.match_1_downstream = contigs_1.colibs[contig_pair[0]][i + 1].colib_id
  153. elif i + 1 == len(contigs_1.colibs[contig_pair[0]]):
  154. colib.match_1_upstream = contigs_1.colibs[contig_pair[0]][i - 1].colib_id
  155. else:
  156. colib.match_1_upstream = contigs_1.colibs[contig_pair[0]][i - 1].colib_id
  157. colib.match_1_downstream = contigs_1.colibs[contig_pair[0]][i + 1].colib_id
  158.  
  159. for i, colib in enumerate(contigs_2.colibs[contig_pair[1]]):
  160. if i == 0:
  161. colib.match_2_downstream = contigs_2.colibs[contig_pair[1]][i + 1].colib_id
  162. elif i + 1 == len(contigs_2.colibs[contig_pair[1]]):
  163. colib.match_2_upstream = contigs_2.colibs[contig_pair[1]][i - 1].colib_id
  164. # if len(contigs_2.colibs[contig_pair[1]]) == 2:
  165. else:
  166. colib.match_2_upstream = contigs_2.colibs[contig_pair[1]][i - 1].colib_id
  167. colib.match_2_downstream = contigs_2.colibs[contig_pair[1]][i + 1].colib_id
  168.  
  169. def call_rearrangments(contigs_1, contigs_2, potentially_rearranged_contigs, pair):
  170. ## just doing this for contigs 1 is equivalent to doing it for both because
  171. ## the rearrangements are called in a pairwise fashion
  172. contigs_with_rearrangements = []
  173. for contig_pair in potentially_rearranged_contigs:
  174. rearrangment = False
  175. for colib in contigs_1.colibs[contig_pair[0]]:
  176. ## the strand matters, if same strain, then up should == up if no re-arrangment
  177. if colib.match_1_strand == colib.match_2_strand:
  178. ## if down != down
  179. if colib.match_1_downstream != colib.match_2_downstream:
  180. ## and if they are both not False (if one is false, means that there is no clb, so dont know about re-arrangment)
  181. if False not in (colib.match_1_downstream, colib.match_2_downstream):
  182. ## then rearranement is true
  183. rearrangment = True
  184. contigs_with_rearrangements.append(contig_pair[0])
  185. if colib.match_1_upstream != colib.match_2_upstream:
  186. if False not in (colib.match_1_upstream, colib.match_2_upstream):
  187. rearrangment = True
  188. contigs_with_rearrangements.append(contig_pair[0])
  189. ## if opposite strands, up should == down if no re-arrangement
  190. if colib.match_1_strand != colib.match_2_strand:
  191. if colib.match_1_downstream != colib.match_2_upstream:
  192. if False not in (colib.match_1_downstream, colib.match_2_upstream):
  193. rearrangment = True
  194. contigs_with_rearrangements.append(contig_pair[0])
  195. if colib.match_1_downstream != colib.match_2_upstream:
  196. if False not in (colib.match_1_downstream, colib.match_2_upstream):
  197. rearrangment = True
  198. contigs_with_rearrangements.append(contig_pair[0])
  199. print '\t'.join(map(str, [pair[0], pair[1], contig_pair[0], contig_pair[1], rearrangment]))
  200. # print
  201. # print colib, 'colib'
  202. # pprint.pprint(vars(colib))
  203. # for second_colib in contigs_2.colibs[contig_pair[1]]:
  204. # if second_colib.colib_id == colib.colib_id:
  205. # print second_colib, 'second_colib'
  206. # pprint.pprint(vars(second_colib))
  207. ## TODO - check that this number is correct
  208. print 'There are %s contigs with re-arrangements between %s' % (len(set(contigs_with_rearrangements)), pair)
  209.  
  210. def sort_colibs_within_contigs(contigs_1, contigs_2):
  211. ## sort all the colibs for both contig1 and colib.match_2_name
  212. ## even though the same instance of the colib class can be in >1 list
  213. ## not a problem, as just sorting the list, not changing the class instance itself.
  214. ## match_1_cons_pos[0]/match_2_cons_pos[0] is the start of each colib
  215. for contig in contigs_1.colibs:
  216. contigs_1.colibs[contig] = sorted(contigs_1.colibs[contig], key = lambda x:x.match_1_cons_pos[0])
  217. for contig in contigs_2.colibs:
  218. contigs_2.colibs[contig] = sorted(contigs_2.colibs[contig], key = lambda x:x.match_2_cons_pos[0])
  219. return contigs_1, contigs_2
  220.  
  221. def identify_potential_rearrangements(contigs_1, contigs_2):
  222. ## contigs_1.colibs = {contig1:[list, of, colibs]}
  223. potentially_rearranged_contigs = []
  224. for contig in contigs_1.colibs:
  225. # if contig == 'contig19':
  226. ## does the contig have more than one colib?
  227. if len(contigs_1.colibs[contig]) > 1:
  228. # tmp = [x for x in contigs_1.colibs[contig] if x.match_1_name != contig]
  229. # print contig, len(contigs_1.colibs[contig]), len(tmp)
  230. ## contigs_1.colibs[contig] is a list of all the colibs for that contig
  231. ## for each colib in that contig
  232. for colib in contigs_1.colibs[contig]:
  233. ## i dont know why colib.match_2_name can still be none here...
  234. if colib.match_2_name == None:
  235. continue
  236. ## if the contig which is matched to contigs_1 has more than one colib
  237. if len(contigs_2.colibs[colib.match_2_name]) > 1:
  238. ## print it
  239. # print contig, colib.match_2_name
  240. potentially_rearranged_contigs.append([contig, colib.match_2_name])
  241. # check_colibs_within_contigs(contigs_1, contigs_2, contig)
  242. # pprint.pprint(potentially_rearranged_contigs)
  243. return potentially_rearranged_contigs
  244.  
  245. def run_find_rearrangements(contig_1_handle, contig_2_handle, xmfa_handle, pair):
  246. ## this creates e.g. contigs_1.contig_range which = {contig1:[1,300000], ...}
  247. contigs_1 = Contigs(contig_1_handle)
  248. contigs_2 = Contigs(contig_2_handle)
  249. ## this outputs a list of co-linear blocks which are present in both samples
  250. paired_colibs = read_in_xmfa(xmfa_handle)
  251. # pprint.pprint(contigs_1.contig_range)
  252. for colib in paired_colibs:
  253. ## for each colib, assign it to a contig in each assembly
  254. colib.assign_to_contig(contigs_1, contigs_2)
  255. # print contig_1_handle, contig_2_handle
  256. ## sort the colibs by the first position in the colib. not sure that its necassary, but good to double check
  257. contigs_1, contigs_2 = sort_colibs_within_contigs(contigs_1, contigs_2)
  258. ## find pairs of contigs which are linked by a colib, where both contigs have more than one colib
  259. potentially_rearranged_contigs = identify_potential_rearrangements(contigs_1, contigs_2)
  260. for contig_pair in potentially_rearranged_contigs:
  261. ## for each colib, figure out what is upstream and downstream of it
  262. get_up_downstream_colib(contigs_1, contigs_2, contig_pair)
  263. ## check that for each colib, whether the upstream/downstream clbs are consistent with a rearrangement or not.
  264. call_rearrangments(contigs_1, contigs_2, potentially_rearranged_contigs, pair)
  265.  
  266. def get_all_pairs(todo_list):
  267. return list(itertools.combinations(todo_list, 2))
  268.  
  269. def run_mauve(contig_1_handle, contig_2_handle, output_dir):
  270. sample_name_1 = contig_1_handle.split('/')[-1].rstrip('.fa')
  271. sample_name_2 = contig_2_handle.split('/')[-1].rstrip('.fa')
  272. #os.system('progressiveMauve --output {0}/st93_{1}_vs_{2}.xmfa {3} {4}'.format(output_dir, sample_name_1, sample_name_2, contig_1_handle, contig_2_handle))
  273. return '{0}/{1}_vs_{2}.xmfa'.format(output_dir, sample_name_1, sample_name_2)
  274.  
  275. def main():
  276. contig_1_handle, contig_2_handle, output_dir = get_args()
  277. xmfa_handle = run_mauve(contig_1_handle, contig_2_handle, output_dir)
  278. run_find_rearrangements(contig_1_handle, contig_2_handle, xmfa_handle, pair)
  279.  
  280.  
  281. if __name__ == '__main__':
  282. main()
Add Comment
Please, Sign In to add comment