Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import division
- from Bio import SeqIO
- import pprint
- import glob
- import os
- import sys
- import itertools
- class Contigs():
- def __init__(self, contig_handle):
- self.contig_handle = contig_handle
- self.colibs = {}
- self.read_in_contigs()
- self.calc_contig_range()
- def read_in_contigs(self):
- self.contig_seq = {}
- self.contig_order = []
- for contig in SeqIO.parse(self.contig_handle, 'fasta'):
- self.contig_order.append(contig.id)
- self.contig_seq[contig.id] = str(contig.seq)
- def calc_contig_range(self):
- i = 0
- self.contig_range = {}
- for contig in self.contig_order:
- self.contig_range[contig] = [i, i+len(self.contig_seq[contig])]
- z = i+len(self.contig_seq[contig])
- i += len(self.contig_seq[contig])
- total_assembly_length = sum([len(self.contig_seq[x]) for x in self.contig_seq])
- assert total_assembly_length == z
- class CoLiB():
- """docstring for CoLiB"""
- def __init__(self, match_1, match_2, colib_id):
- self.colib_id = colib_id
- self.match_1_cons_pos = [match_1[1], match_1[2]]
- self.match_2_cons_pos = [match_2[1], match_2[2]]
- self.match_1_strand = match_1[4]
- self.match_2_strand = match_2[4]
- self.match_1_name = None
- self.match_2_name = None
- ## set these to false for all the singleton contig/colibs which dont get checked for upstream/downstream
- self.match_1_upstream = False
- self.match_2_upstream = False
- self.match_1_downstream = False
- self.match_2_downstream = False
- # self.match_1_chromo_pos = []
- # self.match_2_chromo_pos = []
- def calc_intersection(self, list1, list2):
- ## do it this way because looking at intersection of ranges takes ages.
- ## it returns a negative value if there is no intersection
- d1 = 0
- d2 = 0
- if max(list2) - max(list1) < 0:
- d1 = abs(max(list2) - max(list1))
- if min(list2) - min(list1) < 0:
- d2 = abs(min(list2) - min(list1))
- return (max(list1) - min(list2)) - (d1 + d2)
- def assign_to_contig(self, contigs_1, contigs_2):
- for contig in contigs_1.contig_range:
- ## calculate the intersection between each contig and each colib
- inter = self.calc_intersection(self.match_1_cons_pos, contigs_1.contig_range[contig])
- ## if the intersection accounts for either 2% of the contig length or 1000bp, set the colib match name as the contig
- ## also, add the colib to the contigs.colib dictionary which is like
- ## this {'contig4':[colib1, colib2]}
- if inter / (contigs_1.contig_range[contig][1] - contigs_1.contig_range[contig][0]) > 0.02 or inter > 1000:
- #print contig, inter, (contigs_1.contig_range[contig][1] - contigs_1.contig_range[contig][0])
- self.match_1_name = contig
- if contig in contigs_1.colibs:
- contigs_1.colibs[contig].append(self)
- else:
- contigs_1.colibs[contig] = []
- contigs_1.colibs[contig].append(self)
- for contig in contigs_2.contig_range:
- inter = self.calc_intersection(self.match_2_cons_pos, contigs_2.contig_range[contig])
- if inter / (contigs_2.contig_range[contig][1] - contigs_2.contig_range[contig][0]) > 0.02 or inter > 1000:
- #print contig, inter, (contigs_2.contig_range[contig][1] - contigs_2.contig_range[contig][0])
- self.match_2_name = contig
- if contig in contigs_2.colibs:
- contigs_2.colibs[contig].append(self)
- else:
- contigs_2.colibs[contig] = []
- contigs_2.colibs[contig].append(self)
- def usage():
- print '''Usage is:
- python infer_rearrangements.py assembly_1.fa assembly_2.fa /path/to/output_dir
- Assemblies should end .fa'''
- sys.exit()
- def get_args():
- if len(sys.argv) != 4:
- usage()
- else:
- return sys.argv[1], sys.argv[2], sys.argv[3]
- def read_in_xmfa(xmfa_handle):
- all_results = []
- with open(xmfa_handle) as fi:
- lines = fi.readlines()
- lines = [x.strip() for x in lines]
- lines = [x for x in lines if x.startswith('> ')]
- lines = [x.lstrip('> ') for x in lines]
- while len(lines) > 1:
- first = lines.pop(0).replace(':', ' ').split(' ')
- first = [int(first[0]), int(first[1].split('-')[0]), int(first[1].split('-')[1]), first[-1], first[2]]
- # print first
- second = lines.pop(0).replace(':', ' ').split(' ')
- second = [int(second[0]), int(second[1].split('-')[0]), int(second[1].split('-')[1]), second[-1], second[2]]
- all_results.append([first, second])
- paired_colibs = []
- ## colib is colinear block
- i = 1
- for colib in all_results:
- if colib[0][0] == 1 and colib[1][0] == 2:
- # print colib
- clb = CoLiB(colib[0], colib[1], i)
- i += 1
- paired_colibs.append(clb)
- return paired_colibs
- def get_up_downstream_colib(contigs_1, contigs_2, contig_pair):
- '''
- 0. for every colib in contigs_1.colibs[contig]
- 1. is colib.match1 the other co-lib up or downstream of it?
- 2.
- for colib in contigs1.colibs[contig]:
- what is upstream and downstream of the colib?
- what is upstream and downstream of the same colib on contigs_2.colibs[colib.match_2_name]?
- taking strand differences into account, are there different colibs either upstream or downstream of the original colib in the two strains?
- '''
- ## i think this way of doing the match 1 by iterating through contigs 1 and
- ## match 2 by iterating through contigs 2 works ok
- ## default of all upstream/downstream is false, otherwith all the up/downstream for contigs with only one clb is None and the logic breaks
- for i, colib in enumerate(contigs_1.colibs[contig_pair[0]]):
- ## if its the first one
- if i == 0:
- ## then upstream must be false (default)
- ## downstream is the colib one downstream of this one
- ## contig_pair[0] is the contig this strain is on
- ## contigs_1.colibs[contig_pair[0]] is the list of colibs for this contig
- ## i + 1 is the one downstream of this
- colib.match_1_downstream = contigs_1.colibs[contig_pair[0]][i + 1].colib_id
- elif i + 1 == len(contigs_1.colibs[contig_pair[0]]):
- colib.match_1_upstream = contigs_1.colibs[contig_pair[0]][i - 1].colib_id
- else:
- colib.match_1_upstream = contigs_1.colibs[contig_pair[0]][i - 1].colib_id
- colib.match_1_downstream = contigs_1.colibs[contig_pair[0]][i + 1].colib_id
- for i, colib in enumerate(contigs_2.colibs[contig_pair[1]]):
- if i == 0:
- colib.match_2_downstream = contigs_2.colibs[contig_pair[1]][i + 1].colib_id
- elif i + 1 == len(contigs_2.colibs[contig_pair[1]]):
- colib.match_2_upstream = contigs_2.colibs[contig_pair[1]][i - 1].colib_id
- # if len(contigs_2.colibs[contig_pair[1]]) == 2:
- else:
- colib.match_2_upstream = contigs_2.colibs[contig_pair[1]][i - 1].colib_id
- colib.match_2_downstream = contigs_2.colibs[contig_pair[1]][i + 1].colib_id
- def call_rearrangments(contigs_1, contigs_2, potentially_rearranged_contigs, pair):
- ## just doing this for contigs 1 is equivalent to doing it for both because
- ## the rearrangements are called in a pairwise fashion
- contigs_with_rearrangements = []
- for contig_pair in potentially_rearranged_contigs:
- rearrangment = False
- for colib in contigs_1.colibs[contig_pair[0]]:
- ## the strand matters, if same strain, then up should == up if no re-arrangment
- if colib.match_1_strand == colib.match_2_strand:
- ## if down != down
- if colib.match_1_downstream != colib.match_2_downstream:
- ## and if they are both not False (if one is false, means that there is no clb, so dont know about re-arrangment)
- if False not in (colib.match_1_downstream, colib.match_2_downstream):
- ## then rearranement is true
- rearrangment = True
- contigs_with_rearrangements.append(contig_pair[0])
- if colib.match_1_upstream != colib.match_2_upstream:
- if False not in (colib.match_1_upstream, colib.match_2_upstream):
- rearrangment = True
- contigs_with_rearrangements.append(contig_pair[0])
- ## if opposite strands, up should == down if no re-arrangement
- if colib.match_1_strand != colib.match_2_strand:
- if colib.match_1_downstream != colib.match_2_upstream:
- if False not in (colib.match_1_downstream, colib.match_2_upstream):
- rearrangment = True
- contigs_with_rearrangements.append(contig_pair[0])
- if colib.match_1_downstream != colib.match_2_upstream:
- if False not in (colib.match_1_downstream, colib.match_2_upstream):
- rearrangment = True
- contigs_with_rearrangements.append(contig_pair[0])
- print '\t'.join(map(str, [pair[0], pair[1], contig_pair[0], contig_pair[1], rearrangment]))
- # print
- # print colib, 'colib'
- # pprint.pprint(vars(colib))
- # for second_colib in contigs_2.colibs[contig_pair[1]]:
- # if second_colib.colib_id == colib.colib_id:
- # print second_colib, 'second_colib'
- # pprint.pprint(vars(second_colib))
- ## TODO - check that this number is correct
- print 'There are %s contigs with re-arrangements between %s' % (len(set(contigs_with_rearrangements)), pair)
- def sort_colibs_within_contigs(contigs_1, contigs_2):
- ## sort all the colibs for both contig1 and colib.match_2_name
- ## even though the same instance of the colib class can be in >1 list
- ## not a problem, as just sorting the list, not changing the class instance itself.
- ## match_1_cons_pos[0]/match_2_cons_pos[0] is the start of each colib
- for contig in contigs_1.colibs:
- contigs_1.colibs[contig] = sorted(contigs_1.colibs[contig], key = lambda x:x.match_1_cons_pos[0])
- for contig in contigs_2.colibs:
- contigs_2.colibs[contig] = sorted(contigs_2.colibs[contig], key = lambda x:x.match_2_cons_pos[0])
- return contigs_1, contigs_2
- def identify_potential_rearrangements(contigs_1, contigs_2):
- ## contigs_1.colibs = {contig1:[list, of, colibs]}
- potentially_rearranged_contigs = []
- for contig in contigs_1.colibs:
- # if contig == 'contig19':
- ## does the contig have more than one colib?
- if len(contigs_1.colibs[contig]) > 1:
- # tmp = [x for x in contigs_1.colibs[contig] if x.match_1_name != contig]
- # print contig, len(contigs_1.colibs[contig]), len(tmp)
- ## contigs_1.colibs[contig] is a list of all the colibs for that contig
- ## for each colib in that contig
- for colib in contigs_1.colibs[contig]:
- ## i dont know why colib.match_2_name can still be none here...
- if colib.match_2_name == None:
- continue
- ## if the contig which is matched to contigs_1 has more than one colib
- if len(contigs_2.colibs[colib.match_2_name]) > 1:
- ## print it
- # print contig, colib.match_2_name
- potentially_rearranged_contigs.append([contig, colib.match_2_name])
- # check_colibs_within_contigs(contigs_1, contigs_2, contig)
- # pprint.pprint(potentially_rearranged_contigs)
- return potentially_rearranged_contigs
- def run_find_rearrangements(contig_1_handle, contig_2_handle, xmfa_handle, pair):
- ## this creates e.g. contigs_1.contig_range which = {contig1:[1,300000], ...}
- contigs_1 = Contigs(contig_1_handle)
- contigs_2 = Contigs(contig_2_handle)
- ## this outputs a list of co-linear blocks which are present in both samples
- paired_colibs = read_in_xmfa(xmfa_handle)
- # pprint.pprint(contigs_1.contig_range)
- for colib in paired_colibs:
- ## for each colib, assign it to a contig in each assembly
- colib.assign_to_contig(contigs_1, contigs_2)
- # print contig_1_handle, contig_2_handle
- ## sort the colibs by the first position in the colib. not sure that its necassary, but good to double check
- contigs_1, contigs_2 = sort_colibs_within_contigs(contigs_1, contigs_2)
- ## find pairs of contigs which are linked by a colib, where both contigs have more than one colib
- potentially_rearranged_contigs = identify_potential_rearrangements(contigs_1, contigs_2)
- for contig_pair in potentially_rearranged_contigs:
- ## for each colib, figure out what is upstream and downstream of it
- get_up_downstream_colib(contigs_1, contigs_2, contig_pair)
- ## check that for each colib, whether the upstream/downstream clbs are consistent with a rearrangement or not.
- call_rearrangments(contigs_1, contigs_2, potentially_rearranged_contigs, pair)
- def get_all_pairs(todo_list):
- return list(itertools.combinations(todo_list, 2))
- def run_mauve(contig_1_handle, contig_2_handle, output_dir):
- sample_name_1 = contig_1_handle.split('/')[-1].rstrip('.fa')
- sample_name_2 = contig_2_handle.split('/')[-1].rstrip('.fa')
- #os.system('progressiveMauve --output {0}/st93_{1}_vs_{2}.xmfa {3} {4}'.format(output_dir, sample_name_1, sample_name_2, contig_1_handle, contig_2_handle))
- return '{0}/{1}_vs_{2}.xmfa'.format(output_dir, sample_name_1, sample_name_2)
- def main():
- contig_1_handle, contig_2_handle, output_dir = get_args()
- xmfa_handle = run_mauve(contig_1_handle, contig_2_handle, output_dir)
- run_find_rearrangements(contig_1_handle, contig_2_handle, xmfa_handle, pair)
- if __name__ == '__main__':
- main()
Add Comment
Please, Sign In to add comment