Pastebin launched a little side project called VERYVIRAL.com, check it out ;-) Want more features on Pastebin? Sign Up, it's FREE!
Guest

Union/Intersection/Diff

By: ploffie on Nov 20th, 2012  |  syntax: Python  |  size: 8.58 KB  |  views: 45  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. import heapq
  2.  
  3. def inter1(a, b):
  4.     """Loop over a and b and check if there are doubles"""
  5.     return [ai for ai in a for bi in b if ai == bi]
  6.  
  7. def union1(a, b):
  8.     """Loop over the elements of a+b"""
  9.     res = []
  10.     for x in a + b:
  11.         if x not in res:
  12.             res.append(x)
  13.     return res
  14.  
  15. def inter6(a, b):
  16.     """Make set of b and loop over a and check if ai in b"""
  17.     b = set(b)
  18.     return [ai for ai in a if ai in b]
  19.  
  20. def inter2(a, b):
  21.     """Loop over a and check if present in b"""
  22.     return [ai for ai in a if ai in b]
  23.  
  24. def union2(a, b):
  25.     """Output a + Loop over b and check if present in a"""
  26.     return a + [bi for bi in b if bi not in a]
  27.  
  28. def union6(a, b):
  29.     """Make set of a. Output a + loop over b and check if present in a"""
  30.     sa = set(a)
  31.     return a + [bi for bi in b if bi not in sa]
  32.  
  33. def inter3(a, b):
  34.     """Output a after removing all elements in b"""
  35.     res = list(a)
  36.     for ai in a:
  37.         if ai not in b: res.remove(ai)
  38.     return res
  39.  
  40. def inter4(a, b):
  41.     """Make heaps of a and b. Copy to output elements that are in a and b"""
  42.     ha = list(a)
  43.     hb = list(b)
  44.     res = []
  45.     heapq.heapify(ha)
  46.     heapq.heapify(hb)
  47.     while ha and hb:
  48.         if ha[0] < hb[0]:
  49.             heapq.heappop(ha)
  50.         elif ha[0] > hb[0]:
  51.             heapq.heappop(hb)
  52.         else:
  53.             res.append(heapq.heappop(ha))
  54.             heapq.heappop(hb)
  55.     return res
  56.            
  57. def union4(a, b):    
  58.     """Make heaps of a and b. Copy to output elements that are in a or b"""
  59.     ha = list(a)
  60.     hb = list(b)
  61.     res = []
  62.     heapq.heapify(ha)
  63.     heapq.heapify(hb)
  64.     rappend, heappop = res.append, heapq.heappop
  65.     while ha and hb:
  66.         if ha[0] < hb[0]:
  67.             rappend(heappop(ha))
  68.         elif ha[0] > hb[0]:
  69.             rappend(heappop(hb))
  70.         else:
  71.             rappend(heappop(ha))
  72.             heappop(hb)
  73.     return res + ha + hb # either ha or hb can contain a tail
  74.  
  75. def inter5(a, b):
  76.     """sort a+b and copy only duplicates by scanning"""
  77.     it = iter(sorted(a+b))
  78.     prev = next(it)
  79.     res = []
  80.     rapp = res.append
  81.     for e in it:
  82.         if e == prev:
  83.             rapp(e)
  84.         prev = e
  85.     return res
  86.        
  87. def union5(a, b):
  88.     """sort a+b and copy (duplicates only once) by scanning"""
  89.     it = iter(sorted(a+b))
  90.     prev = next(it)
  91.     res = [prev]
  92.     rapp = res.append
  93.     for e in it:
  94.         if e != prev:
  95.             rapp(e)
  96.         prev = e
  97.     return res
  98.        
  99. def inter99(a, b):
  100.     """Make set of a and b. Output list of intersection."""
  101.     return list(set(a) & set(b))
  102.  
  103. def union99(a, b):
  104.     """Make set of a and b. Output list of union."""
  105.     return list(set(a) | set(b))
  106.  
  107. def union99a(a, b):
  108.     """Output list of set of a + b."""
  109.     return list(set(a + b))
  110.  
  111. def diff1(a, b):
  112.     """Loop over a and check if ai not in b"""
  113.     return [ai for ai in a if ai not in b]
  114.    
  115. def diff4(a, b):    
  116.     """Make heaps of a and b. Copy to output elements that are i na but not in b"""
  117.     ha = list(a)
  118.     hb = list(b)
  119.     res = []
  120.     heapq.heapify(ha)
  121.     heapq.heapify(hb)
  122.     while ha and hb:
  123.         if ha[0] < hb[0]:
  124.             res.append(heapq.heappop(ha))
  125.         elif ha[0] > hb[0]:
  126.             heapq.heappop(hb)
  127.         else:
  128.             heapq.heappop(ha)
  129.             heapq.heappop(hb)
  130.     return res + ha # ha can contain a tail
  131.    
  132. def diff6(a, b):
  133.     """Make set of b and loop over a and check that ai not in set(b)"""
  134.     setb = set(b)
  135.     return [ai for ai in a if ai not in setb]
  136.    
  137. def diff99(a, b):
  138.     """Make sets of a and b and subtract"""
  139.     return list(set(a) - set(b))
  140.    
  141. if __name__ == '__main__':
  142.     from time import clock
  143.     from random import randint
  144.     from collections import defaultdict
  145.     import math
  146.    
  147.     def ranarr(n, N):
  148.         """choose n values from 0..N-1 - no doubles"""
  149.         data = range(N)
  150.         res = []
  151.         for i in range(n):
  152.             res.append(data.pop(randint(0, N - i - 1)))
  153.         return res
  154.    
  155.     def ranarr2(n, N, nov):
  156.         data = range(N)
  157.         a, b = [], []
  158.         for i in range(nov):
  159.             x = data.pop(randint(0, N - i - 1))
  160.             a.append(x)
  161.             b.append(x)
  162.         for i in range(n - nov):
  163.             a.append(data.pop(randint(0, N - (i + nov) - 1)))
  164.         for i in range(n - nov):
  165.             a.append(data.pop(randint(0, N - (i + nov + (n - nov)) - 1)))
  166.         return a, b
  167.            
  168.     timings = defaultdict(list)
  169.     def tim(method, a, b, n=100):
  170.         name = method.__name__
  171.         t0 = clock()
  172.         for i in xrange(n):
  173.             method(a, b)
  174.         ti = clock() - t0
  175.         timings[name].append((len(a), ti))
  176.        
  177.     Ns = [39, 40, 41, 159, 160, 161, 640, 1280, 1281, 5119, 5120, 5121]
  178.     a_arr, b_arr = {}, {}  
  179.     for N in Ns:
  180.         a_arr[N], b_arr[N] = ranarr2(N, 3 * N, N // 3)
  181.        
  182.    
  183.     intersections = [inter1, inter2, inter3, inter4, inter5, inter6, inter99]
  184.     unions = [union1, union2, union4, union5, union6, union99, union99a]
  185.     totime = [diff1, diff4, diff6, diff99]
  186.     for method in totime:
  187.         for N in Ns:
  188.             a = a_arr[N]
  189.             b = b_arr[N]
  190.             tim(method, a, b, 1)
  191.  
  192.     def fit_timings(model, t):
  193.         """models have form ti = a * f(ni) - linear without intercept"""
  194.         a = sum(ti * model(ni) for ni,ti in t) / sum(model(ni) ** 2 for ni,ti in t)
  195.         err2 = sum((a * model(ni) - ti) ** 2 for ni, ti in t)
  196.         return a, err2
  197.            
  198.     def fit_timings1(model, t):
  199.         """models have form ti = a * f(ni) - linear without intercept"""
  200.         n = len(t)
  201.         tav = sum(ti for ni, ti in t) / n
  202.         mav = sum(model(ni) for ni, ti in t) / n
  203.         a = sum((ti-tav) * model(ni) for ni,ti in t) / sum((model(ni)-mav) * model(ni) for ni,ti in t)
  204.         b = tav - a * mav
  205.         err2 = sum((a * model(ni) + b - ti) ** 2 for ni, ti in t)
  206.         #print b
  207.         return a, err2
  208.            
  209.     def check_models(t):
  210.         res = {}
  211.         besterr = float("inf")
  212.         mod_n = lambda n: n
  213.         mod_n2 = lambda n: n ** 2
  214.         mod_n3 = lambda n: n ** 3
  215.         mod_nlogn = lambda n: n * math.log(n)
  216.         for model, mname in [(mod_n, "O(n)") , (mod_n2, "O(n2)"),
  217.             (mod_n3, "O(n3)"), (mod_nlogn, "O(nlogn)")]:
  218.             coeff, err2 = fit_timings(model, t)
  219.             res[mname] = err2
  220.             if err2 < besterr:
  221.                 bestmethod = mname
  222.                 besterr = err2
  223.         serr, altmethod = min((v, key) for key, v in res.iteritems() if key != bestmethod)
  224.         ratio = serr / besterr
  225.         return bestmethod, coeff, ratio, altmethod, res
  226.                    
  227.     for method in totime:
  228.         name = method.__name__
  229.         doc = method.__doc__
  230.         times = timings[name]
  231.         bestmethod, coeff, ratio, altmethod, fiterr = check_models(times)
  232.         print "{0:8s} {1:8s} coeff: {2:6.2e}  ratio:{3:6.0f}   alt: {4:8s} "\
  233.             "{5:s}".format(name, bestmethod, coeff, ratio, altmethod, doc)
  234.  
  235. """
  236. inter1   O(n2)    coeff: 8.78e-05  ratio:   704   alt: O(nlogn) Loop over a and b and check if there are doubles
  237. inter2   O(n2)    coeff: 3.12e-05  ratio:  1958   alt: O(nlogn) Loop over a and check if present in b
  238. inter3   O(n2)    coeff: 3.47e-05  ratio:   510   alt: O(nlogn) Output a after removing all elements in b
  239. inter4   O(nlogn) coeff: 1.03e-05  ratio:    34   alt: O(n)     Make heaps of a and b. Copy to output elements that are in a and b
  240. inter99  O(n)     coeff: 2.71e-07  ratio:     5   alt: O(nlogn) Make set of a and b. Output list of intersection.
  241. union1   O(n2)    coeff: 5.83e-05  ratio:  7656   alt: O(nlogn) Loop over the elements of a+b
  242. union2   O(n2)    coeff: 3.06e-05  ratio:  1225   alt: O(nlogn) Output a + Loop over b and check if present in a
  243. union4   O(nlogn) coeff: 1.09e-05  ratio:     3   alt: O(n)     Make heaps of a and b. Copy to output elements that are in a or b
  244. union99  O(n)     coeff: 2.94e-07  ratio:     4   alt: O(nlogn) Make set of a and b. Output list of union.
  245. union99a O(nlogn) coeff: 2.30e-07  ratio:     1   alt: O(n)     Output list of set of a + b.
  246. diff1    O(n2)    coeff: 4.01e-06  ratio:    27   alt: O(n3)    Loop over a and check if ai not in b
  247. diff4    O(nlogn) coeff: 6.06e-07  ratio:     5   alt: O(n)     Make heaps of a and b. Copy to output elements that are i na but not in b
  248. diff6    O(n)     coeff: 1.38e-08  ratio:     4   alt: O(nlogn) Make set of b and loop over a and check that ai not in set(b)
  249. diff99   O(nlogn) coeff: 2.07e-08  ratio:     2   alt: O(n)     Make sets of a and b and subtract
  250. """