Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Jul 6th, 2012  |  syntax: None  |  size: 3.13 KB  |  hits: 11  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. combining values for a large number of overlapping intervals of dictionary keys
  2. all={
  3.     1:{ ('a',123,145):20, ('a',155,170):12, ('b',234,345): 34},
  4.     2:{ ('a',121,135):10, ('a',155,175):28, ('b',230,345): 16},
  5.     3:{ ('a',130,140):20, ('a',150,170):10, ('b',234,345): 30},
  6.     ...
  7.     n: {...}
  8. }
  9.        
  10. { ('a',121,122):10, ('a',123,130):30, ('a',131,135):50,
  11.    ('a',136,140):40,('a',141,145):20, ...}
  12.        
  13. all={
  14.     1:{ ('a',123,145):20, ('a',155,170):12, ('b',234,345): 34},
  15.     2:{ ('a',121,135):10, ('a',155,175):28, ('b',230,345): 16},
  16.     3:{ ('a',130,140):20, ('a',150,170):10, ('b',234,345): 30}
  17. }
  18.  
  19. from collections import defaultdict
  20. summer = defaultdict(int)
  21. mini, maxi = 0,0
  22. for d in all.values():
  23.     for (name, start, stop), value in d.iteritems():
  24.         # im completely ignoring the `name` here, not sure if that's what you want
  25.         # else just separate the data before doing this ...
  26.         if mini == 0:
  27.             mini = start
  28.         mini, maxi = min(mini, start), max(maxi, stop)
  29.         for i in range(start, stop+1):
  30.             summer[i]+=value
  31.  
  32. # now we have the values at each point, very redundant but very fast so  far
  33. print summer
  34.  
  35. # now we can find the intervals:
  36. def get_intervals(points, start, stop):
  37.     cstart = start
  38.     for i in range(start, stop+1):
  39.         if points[cstart] != points[i]: # did the value change ?
  40.             yield cstart, i-1, points[cstart]
  41.             cstart = i
  42.  
  43.     if cstart != i:
  44.         yield cstart, i, points[cstart]
  45.  
  46.  
  47. print list(get_intervals(summer, mini, maxi))
  48.        
  49. [(121, 122, 10), (123, 129, 30), (130, 135, 50), (136, 140, 40), (141, 145, 20), (146, 149, 0), (150, 154, 10), (155, 170, 50), (171, 175, 28)]
  50.        
  51. from collections import defaultdict
  52. from heapq import heappush, heappop
  53.  
  54. class Summer(object):
  55.     def __init__(self):
  56.         # its a priority queue, kind of like a sorted list
  57.         self.hq = []
  58.  
  59.     def additem(self, start, stop, value):
  60.         # at `start` add it as a positive value
  61.         heappush(self.hq, (start, value))
  62.         # at `stop` subtract that value again
  63.         heappush(self.hq, (stop, -value))
  64.  
  65.     def intervals(self):
  66.         hq = self.hq
  67.         start, val = heappop(hq)
  68.         while hq:
  69.             point, value = heappop(hq)
  70.             yield start, point, val
  71.             # just maintain the current value and where the interval started
  72.             val += value
  73.             start = point
  74.         assert val == 0
  75.  
  76. summers = defaultdict(Summer)
  77. for d in all.values():
  78.     for (name, start, stop), value in d.iteritems():
  79.         summers[name].additem(start, stop, value)
  80.  
  81. for name,s in summers.iteritems():
  82.     print name, list(s.intervals())
  83.        
  84. {"Chr1": {(121,122):10, (123,130):30, ...},
  85. "Chr2": {(230,233):16, ...},
  86. ...
  87. }
  88.        
  89. for (start, end), score in intervals_to_add.items():
  90.     overlapping = {}
  91.     for (start1, end1), score1 in current_chromosome.items():
  92.         if start1 <= start <= end1 or start1 <= end <= end1:
  93.             overlapping[(start1, end1)] = score1
  94.     for interval in overlapping:
  95.         current_chromosome.pop(interval)
  96.     # Process overlapping into smaller intervals, adding in the current interval
  97.     current_chromosome.update(new_intervals)