- combining values for a large number of overlapping intervals of dictionary keys
- all={
- 1:{ ('a',123,145):20, ('a',155,170):12, ('b',234,345): 34},
- 2:{ ('a',121,135):10, ('a',155,175):28, ('b',230,345): 16},
- 3:{ ('a',130,140):20, ('a',150,170):10, ('b',234,345): 30},
- ...
- n: {...}
- }
- { ('a',121,122):10, ('a',123,130):30, ('a',131,135):50,
- ('a',136,140):40,('a',141,145):20, ...}
- all={
- 1:{ ('a',123,145):20, ('a',155,170):12, ('b',234,345): 34},
- 2:{ ('a',121,135):10, ('a',155,175):28, ('b',230,345): 16},
- 3:{ ('a',130,140):20, ('a',150,170):10, ('b',234,345): 30}
- }
- from collections import defaultdict
- summer = defaultdict(int)
- mini, maxi = 0,0
- for d in all.values():
- for (name, start, stop), value in d.iteritems():
- # im completely ignoring the `name` here, not sure if that's what you want
- # else just separate the data before doing this ...
- if mini == 0:
- mini = start
- mini, maxi = min(mini, start), max(maxi, stop)
- for i in range(start, stop+1):
- summer[i]+=value
- # now we have the values at each point, very redundant but very fast so far
- print summer
- # now we can find the intervals:
- def get_intervals(points, start, stop):
- cstart = start
- for i in range(start, stop+1):
- if points[cstart] != points[i]: # did the value change ?
- yield cstart, i-1, points[cstart]
- cstart = i
- if cstart != i:
- yield cstart, i, points[cstart]
- print list(get_intervals(summer, mini, maxi))
- [(121, 122, 10), (123, 129, 30), (130, 135, 50), (136, 140, 40), (141, 145, 20), (146, 149, 0), (150, 154, 10), (155, 170, 50), (171, 175, 28)]
- from collections import defaultdict
- from heapq import heappush, heappop
- class Summer(object):
- def __init__(self):
- # its a priority queue, kind of like a sorted list
- self.hq = []
- def additem(self, start, stop, value):
- # at `start` add it as a positive value
- heappush(self.hq, (start, value))
- # at `stop` subtract that value again
- heappush(self.hq, (stop, -value))
- def intervals(self):
- hq = self.hq
- start, val = heappop(hq)
- while hq:
- point, value = heappop(hq)
- yield start, point, val
- # just maintain the current value and where the interval started
- val += value
- start = point
- assert val == 0
- summers = defaultdict(Summer)
- for d in all.values():
- for (name, start, stop), value in d.iteritems():
- summers[name].additem(start, stop, value)
- for name,s in summers.iteritems():
- print name, list(s.intervals())
- {"Chr1": {(121,122):10, (123,130):30, ...},
- "Chr2": {(230,233):16, ...},
- ...
- }
- for (start, end), score in intervals_to_add.items():
- overlapping = {}
- for (start1, end1), score1 in current_chromosome.items():
- if start1 <= start <= end1 or start1 <= end <= end1:
- overlapping[(start1, end1)] = score1
- for interval in overlapping:
- current_chromosome.pop(interval)
- # Process overlapping into smaller intervals, adding in the current interval
- current_chromosome.update(new_intervals)