Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import sys
- def load_tads(filename):
- tads = {}
- with open(filename) as fh:
- for line in fh:
- chrom, start, end = line.rstrip().split('\t')
- if chrom not in tads:
- tads[chrom] = [(int(start), int(end))]
- else:
- tads[chrom].append((int(start), int(end)))
- return tads
- def main():
- sample1 = load_tads(sys.argv[1])
- sample2 = load_tads(sys.argv[2])
- sample3 = load_tads(sys.argv[3])
- ext = int(sys.argv[4])
- # List of all chromosomes
- chroms = set(sample1.keys() + sample2.keys() + sample3.keys())
- # Initialise variables
- s1_x_s2 = {c: [] for c in chroms}
- s1_x_s3 = {c: [] for c in chroms}
- s2_x_s3 = {c: [] for c in chroms}
- s1_x_s2_x_s3 = {c: [] for c in chroms}
- s1_u = {c: [] for c in chroms}
- s2_u = {c: [] for c in chroms}
- s3_u = {c: [] for c in chroms}
- shared1 = {}
- shared2 = {}
- shared3 = {}
- for c in chroms:
- shared1[c] = [False] * len(sample1[c]) if c in sample1 else []
- shared2[c] = [False] * len(sample2[c]) if c in sample2 else []
- shared3[c] = [False] * len(sample3[c]) if c in sample3 else []
- # For each chromosome
- for c in chroms:
- # Look for shared TADs between S1/S2, S1/S3, and S1/S2/S3
- if c in sample1 and c in sample2 and c in sample3:
- # Loop through all TADs from 1st sample (current chromosome)
- for i, (start1, end1) in enumerate(sample1[c]):
- # Loop through all TADs for 2nd sample
- for j, (start2, end2) in enumerate(sample2[c]):
- if start2 + ext < start1 - ext:
- """
- Tad 2 starts before Tad 1 stars
- /
- __|__ tad1
- /
- __|__ tad2
- """
- continue
- elif start2 - ext > start1 + ext:
- """
- Tad 2 starts after Tad 1 starts
- /
- __|__ tad1
- /
- __|__ tad2
- """
- break
- elif end1 - ext <= end2 + ext <= end1 + ext or end1 - ext <= end2 - ext <= end1 + ext:
- """
- \
- __|__ tad1
- \
- __|__ tad2
- or
- \
- __|__ tad1
- \
- __|__ tad2
- """
- # TAD shared between 1st/2nd samples
- shared1[c][i] = True
- shared2[c][j] = True
- shared_all = False # By default, consider the TADs is NOT shared between all three samples
- # Finally, loop through TADs for the last sample
- for k, (start3, end3) in enumerate(sample3[c]):
- if start3 + ext < start1 - ext or start3 + ext < start2 - ext:
- continue
- elif start3 - ext > start1 + ext or start3 - ext > start2 + ext:
- break
- elif (end1 - ext <= end3 + ext <= end1 + ext or end1 - ext <= end3 - ext <= end1 + ext) and (end2 - ext <= end3 + ext <= end2 + ext or end2 - ext <= end3 - ext <= end2 + ext):
- # Shared between three samples
- shared3[c][k] = True
- shared_all = True
- s1_x_s2_x_s3[c].append((i, j, k))
- """
- # Check for TAD overlap
- n = len(s1_x_s2_x_s3[c])
- print i, j, k
- if n > 1 and s1_x_s2_x_s3[c][n-2][0] == i and s1_x_s2_x_s3[c][n-2][1] == j:
- print '------'
- print c, start1, end1
- print c, start2, end2
- print c, start3, end3
- print c, sample1[c][i]
- print c, sample2[c][j]
- print c, sample3[c][k-1]
- exit(0)
- """
- if not shared_all:
- # If not shared between three samples, it's at least shared between 1st and 2d sample
- s1_x_s2[c].append((i, j))
- if not shared1[c][i]:
- # TAD from 1st sample isn't shared with 2nd sample, but might be with the 3rd
- for k, (start3, end3) in enumerate(sample3[c]):
- if start3 + ext < start1 - ext:
- continue
- elif start3 - ext > start1 + ext:
- break
- elif end1 - ext <= end3 + ext <= end1 + ext or end1 - ext <= end3 - ext <= end1 + ext:
- # Shared between 1st and 3rd samples
- s1_x_s3[c].append((i, k))
- shared1[c][i] = True
- shared3[c][k] = True
- # Look for shared TADs between S2/S3
- if c in sample2 and c in sample3:
- for j, (start2, end2) in enumerate(sample2[c]):
- if shared2[c][j]:
- continue
- for k, (start3, end3) in enumerate(sample3[c]):
- if shared3[c][k]:
- continue
- elif start3 + ext < start2 - ext:
- continue
- elif start3 - ext > start2 + ext:
- break
- elif end2 - ext <= end3 + ext <= end2 + ext or end2 - ext <= end3 - ext <= end2 + ext:
- # Shared
- s2_x_s3[c].append((j, k))
- shared2[c][j] = True
- shared3[c][k] = True
- # List unique (non-shared) TADs
- s1_u[c] = [i for i, b in enumerate(shared1[c]) if not b]
- s2_u[c] = [j for j, b in enumerate(shared2[c]) if not b]
- s3_u[c] = [k for k, b in enumerate(shared3[c]) if not b]
- for i, j, k in s1_x_s2_x_s3[c]:
- print '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(c, sample1[c][i][0], sample1[c][i][1], sample2[c][j][0], sample2[c][j][1], sample3[c][k][0], sample3[c][k][1])
- for i, j in s1_x_s2[c]:
- print '{}\t{}\t{}\t{}\t{}\t-\t-'.format(c, sample1[c][i][0], sample1[c][i][1], sample2[c][j][0], sample2[c][j][1])
- for i, k in s1_x_s3[c]:
- print '{}\t{}\t{}\t-\t-\t{}\t{}'.format(c, sample1[c][i][0], sample1[c][i][1], sample3[c][k][0], sample3[c][k][1])
- for j, k in s2_x_s3[c]:
- print '{}\t-\t-\t{}\t{}\t{}\t{}'.format(c, sample2[c][j][0], sample2[c][j][1], sample3[c][k][0], sample3[c][k][1])
- sys.stderr.write('S1 (all): {}\n'.format(sum(len(sample1[c]) for c in sample1)))
- sys.stderr.write('S2 (all): {}\n'.format(sum(len(sample2[c]) for c in sample2)))
- sys.stderr.write('S3 (all): {}\n'.format(sum(len(sample3[c]) for c in sample3)))
- sys.stderr.write('S1 x S2: {}\n'.format(sum(len(s1_x_s2[c]) for c in s1_x_s2)))
- sys.stderr.write('S1 x S3: {}\n'.format(sum(len(s1_x_s3[c]) for c in s1_x_s3)))
- sys.stderr.write('S2 x S3: {}\n'.format(sum(len(s2_x_s3[c]) for c in s2_x_s3)))
- sys.stderr.write('S1 x S2 x S3: {}\n'.format(sum(len(s1_x_s2_x_s3[c]) for c in s1_x_s2_x_s3)))
- sys.stderr.write('S1 (unique): {}\n'.format(sum(len(s1_u[c]) for c in s1_u)))
- sys.stderr.write('S2 (unique): {}\n'.format(sum(len(s2_u[c]) for c in s2_u)))
- sys.stderr.write('S3 (unique): {}\n'.format(sum(len(s3_u[c]) for c in s3_u)))
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement