Advertisement
Guest User

extension_debug

a guest
Mar 25th, 2019
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.49 KB | None | 0 0
  1. import os
  2. import sys
  3.  
  4.  
  5. def load_tads(filename):
  6. tads = {}
  7. with open(filename) as fh:
  8. for line in fh:
  9. chrom, start, end = line.rstrip().split('\t')
  10. if chrom not in tads:
  11. tads[chrom] = [(int(start), int(end))]
  12. else:
  13. tads[chrom].append((int(start), int(end)))
  14.  
  15. return tads
  16.  
  17.  
  18. def main():
  19. sample1 = load_tads(sys.argv[1])
  20. sample2 = load_tads(sys.argv[2])
  21. sample3 = load_tads(sys.argv[3])
  22. ext = int(sys.argv[4])
  23.  
  24. # List of all chromosomes
  25. chroms = set(sample1.keys() + sample2.keys() + sample3.keys())
  26.  
  27. # Initialise variables
  28. s1_x_s2 = {c: [] for c in chroms}
  29. s1_x_s3 = {c: [] for c in chroms}
  30. s2_x_s3 = {c: [] for c in chroms}
  31. s1_x_s2_x_s3 = {c: [] for c in chroms}
  32. s1_u = {c: [] for c in chroms}
  33. s2_u = {c: [] for c in chroms}
  34. s3_u = {c: [] for c in chroms}
  35.  
  36. shared1 = {}
  37. shared2 = {}
  38. shared3 = {}
  39. for c in chroms:
  40. shared1[c] = [False] * len(sample1[c]) if c in sample1 else []
  41. shared2[c] = [False] * len(sample2[c]) if c in sample2 else []
  42. shared3[c] = [False] * len(sample3[c]) if c in sample3 else []
  43.  
  44. # For each chromosome
  45. for c in chroms:
  46. # Look for shared TADs between S1/S2, S1/S3, and S1/S2/S3
  47. if c in sample1 and c in sample2 and c in sample3:
  48. # Loop through all TADs from 1st sample (current chromosome)
  49. for i, (start1, end1) in enumerate(sample1[c]):
  50. # Loop through all TADs for 2nd sample
  51.  
  52. for j, (start2, end2) in enumerate(sample2[c]):
  53. if start2 + ext < start1 - ext:
  54. """
  55. Tad 2 starts before Tad 1 stars
  56. /
  57. __|__ tad1
  58.  
  59. /
  60. __|__ tad2
  61.  
  62. """
  63. continue
  64. elif start2 - ext > start1 + ext:
  65. """
  66. Tad 2 starts after Tad 1 starts
  67. /
  68. __|__ tad1
  69.  
  70. /
  71. __|__ tad2
  72.  
  73. """
  74. break
  75. elif end1 - ext <= end2 + ext <= end1 + ext or end1 - ext <= end2 - ext <= end1 + ext:
  76. """
  77. \
  78. __|__ tad1
  79.  
  80. \
  81. __|__ tad2
  82.  
  83. or
  84.  
  85. \
  86. __|__ tad1
  87.  
  88. \
  89. __|__ tad2
  90.  
  91. """
  92. # TAD shared between 1st/2nd samples
  93. shared1[c][i] = True
  94. shared2[c][j] = True
  95. shared_all = False # By default, consider the TADs is NOT shared between all three samples
  96.  
  97. # Finally, loop through TADs for the last sample
  98. for k, (start3, end3) in enumerate(sample3[c]):
  99. if start3 + ext < start1 - ext or start3 + ext < start2 - ext:
  100. continue
  101. elif start3 - ext > start1 + ext or start3 - ext > start2 + ext:
  102. break
  103. elif (end1 - ext <= end3 + ext <= end1 + ext or end1 - ext <= end3 - ext <= end1 + ext) and (end2 - ext <= end3 + ext <= end2 + ext or end2 - ext <= end3 - ext <= end2 + ext):
  104. # Shared between three samples
  105. shared3[c][k] = True
  106. shared_all = True
  107. s1_x_s2_x_s3[c].append((i, j, k))
  108.  
  109. """
  110. # Check for TAD overlap
  111. n = len(s1_x_s2_x_s3[c])
  112. print i, j, k
  113. if n > 1 and s1_x_s2_x_s3[c][n-2][0] == i and s1_x_s2_x_s3[c][n-2][1] == j:
  114. print '------'
  115. print c, start1, end1
  116. print c, start2, end2
  117. print c, start3, end3
  118. print c, sample1[c][i]
  119. print c, sample2[c][j]
  120. print c, sample3[c][k-1]
  121. exit(0)
  122. """
  123.  
  124.  
  125. if not shared_all:
  126. # If not shared between three samples, it's at least shared between 1st and 2d sample
  127. s1_x_s2[c].append((i, j))
  128.  
  129. if not shared1[c][i]:
  130. # TAD from 1st sample isn't shared with 2nd sample, but might be with the 3rd
  131. for k, (start3, end3) in enumerate(sample3[c]):
  132. if start3 + ext < start1 - ext:
  133. continue
  134. elif start3 - ext > start1 + ext:
  135. break
  136. elif end1 - ext <= end3 + ext <= end1 + ext or end1 - ext <= end3 - ext <= end1 + ext:
  137. # Shared between 1st and 3rd samples
  138. s1_x_s3[c].append((i, k))
  139. shared1[c][i] = True
  140. shared3[c][k] = True
  141.  
  142. # Look for shared TADs between S2/S3
  143. if c in sample2 and c in sample3:
  144. for j, (start2, end2) in enumerate(sample2[c]):
  145. if shared2[c][j]:
  146. continue
  147.  
  148. for k, (start3, end3) in enumerate(sample3[c]):
  149. if shared3[c][k]:
  150. continue
  151. elif start3 + ext < start2 - ext:
  152. continue
  153. elif start3 - ext > start2 + ext:
  154. break
  155. elif end2 - ext <= end3 + ext <= end2 + ext or end2 - ext <= end3 - ext <= end2 + ext:
  156. # Shared
  157. s2_x_s3[c].append((j, k))
  158. shared2[c][j] = True
  159. shared3[c][k] = True
  160.  
  161. # List unique (non-shared) TADs
  162. s1_u[c] = [i for i, b in enumerate(shared1[c]) if not b]
  163. s2_u[c] = [j for j, b in enumerate(shared2[c]) if not b]
  164. s3_u[c] = [k for k, b in enumerate(shared3[c]) if not b]
  165.  
  166. for i, j, k in s1_x_s2_x_s3[c]:
  167. print '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(c, sample1[c][i][0], sample1[c][i][1], sample2[c][j][0], sample2[c][j][1], sample3[c][k][0], sample3[c][k][1])
  168.  
  169. for i, j in s1_x_s2[c]:
  170. print '{}\t{}\t{}\t{}\t{}\t-\t-'.format(c, sample1[c][i][0], sample1[c][i][1], sample2[c][j][0], sample2[c][j][1])
  171.  
  172. for i, k in s1_x_s3[c]:
  173. print '{}\t{}\t{}\t-\t-\t{}\t{}'.format(c, sample1[c][i][0], sample1[c][i][1], sample3[c][k][0], sample3[c][k][1])
  174.  
  175. for j, k in s2_x_s3[c]:
  176. print '{}\t-\t-\t{}\t{}\t{}\t{}'.format(c, sample2[c][j][0], sample2[c][j][1], sample3[c][k][0], sample3[c][k][1])
  177.  
  178.  
  179. sys.stderr.write('S1 (all): {}\n'.format(sum(len(sample1[c]) for c in sample1)))
  180. sys.stderr.write('S2 (all): {}\n'.format(sum(len(sample2[c]) for c in sample2)))
  181. sys.stderr.write('S3 (all): {}\n'.format(sum(len(sample3[c]) for c in sample3)))
  182.  
  183. sys.stderr.write('S1 x S2: {}\n'.format(sum(len(s1_x_s2[c]) for c in s1_x_s2)))
  184. sys.stderr.write('S1 x S3: {}\n'.format(sum(len(s1_x_s3[c]) for c in s1_x_s3)))
  185. sys.stderr.write('S2 x S3: {}\n'.format(sum(len(s2_x_s3[c]) for c in s2_x_s3)))
  186. sys.stderr.write('S1 x S2 x S3: {}\n'.format(sum(len(s1_x_s2_x_s3[c]) for c in s1_x_s2_x_s3)))
  187.  
  188. sys.stderr.write('S1 (unique): {}\n'.format(sum(len(s1_u[c]) for c in s1_u)))
  189. sys.stderr.write('S2 (unique): {}\n'.format(sum(len(s2_u[c]) for c in s2_u)))
  190. sys.stderr.write('S3 (unique): {}\n'.format(sum(len(s3_u[c]) for c in s3_u)))
  191.  
  192.  
  193.  
  194. if __name__ == '__main__':
  195. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement