Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #-------------------------------------------------------------------------------
- # Name: listex
- # Purpose: list data type helper functions that operate like relational
- # algebra expressions such as union, intersection, etc. these are List to
- # List functions e.g. [1,2,3] insersection [0,2,4] -> [2]
- # Each function takes at least one list as argument(la) and a sort parameter
- # (sa). If the list is already sorted (all of these functions result in a
- # sorted list), about 1/2 processing time can be saved for subsequent
- # functions. The result is always a list.
- #-------------------------------------------------------------------------------
- def iter_intersection(a, b, key=None):
- """intersect(a, b) -> elements from a that are also in b
- Note that both iterables should be sorted (with the same key function
- passed here), for the same reason as with groupby.
- """
- a, b = iter(a), iter(b)
- try:
- nexta = next(a)
- keya = key(nexta) if key else nexta
- nextb = next(b)
- keyb = key(nextb) if key else nextb
- while True:
- if keya == keyb:
- yield nexta
- nexta = next(a)
- keya = key(nexta) if key else nexta
- # NOTE: If you wanted to add flags for multiset
- # intersection or pure set intersection, this is
- # where you'd add the change.
- elif keya < keyb:
- nexta = next(a)
- keya = key(nexta) if key else nexta
- else:
- nextb = next(b)
- keyb = key(nextb) if key else nextb
- except StopIteration:
- return
- #intersection
- # in A and B
- def list_intersection(la, lb, sa=True,sb=True):
- """return list of values in both A and B
- """
- # return list of values in both lists
- if sa:
- la = sorted(la)
- if sb:
- lb = sorted(lb)
- lena = len(la)
- lenb = len(lb)
- ia = 0
- ib = 0
- res = []
- while(ia<lena and ib< lenb):
- #print("{} == {}".format(la[ia],lb[ib]))
- if (la[ia] < lb[ib]):
- ia += 1
- elif (la[ia] > lb[ib]):
- ib += 1
- else:
- res.append(la[ia])
- ia += 1
- ib += 1
- return res
- #symetric difference
- # in A XOR B
- def list_symdif(la, lb, sa=True,sb=True):
- """return list of values in A XOR B
- """
- # return list of values in A XOR B
- if sa:
- la = sorted(la)
- if sb:
- lb = sorted(lb)
- lena = len(la)
- lenb = len(lb)
- ia = 0
- ib = 0
- res = []
- #head
- if (la[0] < lb[0]):
- while(ia<lena and la[ia] < lb[0]):
- #print("{} == {}".format(la[ia],lb[ib]))
- res.append(la[ia])
- ia += 1
- elif (la[0] > lb[0]):
- while(ib < lenb and lb[ib] < la[0]):
- #print("{} == {}".format(la[ia],lb[ib]))
- res.append(lb[ib])
- ib += 1
- #body
- while(ia<lena and ib< lenb):
- #print("{} == {}".format(la[ia],lb[ib]))
- if (la[ia] < lb[ib]):
- res.append(la[ia])
- ia += 1
- elif (la[ia] > lb[ib]):
- res.append(lb[ib])
- ib += 1
- else:
- ia += 1
- ib += 1
- #tail
- if (ia==lena):
- for j in range(ib,lenb):
- res.append(lb[j])
- elif (ib==lenb):
- for j in range(ia,lena):
- res.append(la[j])
- return res
- # UNION
- # in A OR B
- def list_union(la, lb, sa=True,sb=True):
- """return list of values in A OR B
- """
- # return list of values in A OR B
- if sa:
- la = sorted(la)
- if sb:
- lb = sorted(lb)
- lena = len(la)
- lenb = len(lb)
- ia = 0
- ib = 0
- res = []
- #head
- if lena>0 and lenb>0:
- if (la[0] < lb[0]):
- while(ia<lena and la[ia] < lb[0]):
- #print("{} == {}".format(la[ia],lb[ib]))
- res.append(la[ia])
- ia += 1
- elif (la[0] > lb[0]):
- while(ib < lenb and lb[ib] < la[0]):
- #print("{} == {}".format(la[ia],lb[ib]))
- res.append(lb[ib])
- ib += 1
- #body
- while(ia<lena and ib< lenb):
- #print("{} == {}".format(la[ia],lb[ib]))
- if (la[ia] < lb[ib]):
- res.append(la[ia])
- ia += 1
- elif (la[ia] > lb[ib]):
- res.append(lb[ib])
- ib += 1
- else:
- res.append(la[ia])
- ia += 1
- ib += 1
- #tail
- if (ia==lena):
- for j in range(ib,lenb):
- res.append(lb[j])
- elif (ib==lenb):
- for j in range(ia,lena):
- res.append(la[j])
- return res
- #end union()
- #relative complement
- # in A and not B
- def list_relcomp(la, lb, sa=True,sb=True):
- """return list of values in A and not B
- """
- # return list of values in A but not B
- if sa:
- la = sorted(la)
- if sb:
- lb = sorted(lb)
- lena = len(la)
- lenb = len(lb)
- ia = 0
- ib = 0
- res = []
- while(ia<lena and ib< lenb):
- #print("{} == {}".format(la[ia],lb[ib]))
- if (la[ia] < lb[ib]):
- res.append(la[ia])
- ia += 1
- elif (la[ia] > lb[ib]):
- ib += 1
- else:
- ia += 1
- ib += 1
- #tail
- if (ib==lenb):
- for j in range(ia,lena):
- res.append(la[j])
- return res
- def list_unique(la,sa=True):
- """return list of unique values in A
- sa - sort values in la first
- """
- if sa:
- la = sorted(la)
- lena = len(la)
- ia = 0
- prev = la[ia]
- res = []
- res.append(prev)
- while(ia<lena):
- if (la[ia] > prev):
- res.append(la[ia])
- prev = la[ia]
- ia += 1
- else:
- ia += 1
- return res
- #-------------------------------------------------------------------------------
- # Performance Test Section
- #-------------------------------------------------------------------------------
- def niave_relcomp(a,b):
- res = [x for x in a if x not in b]
- return res
- def relcomp_set(a, b):
- aset = a
- bset = b
- res = aset - bset
- res= list(res)
- return res
- def test(nA, nB, relationtype):
- """time test functions.
- relationtype 1=B subset of A, 2=A not in B
- """
- import collections
- import random
- import timeit
- if relationtype==1:
- print("\nFor A subset of B:")
- A = ["string"+str(x) for x in range(0,nA)]
- B = ["string"+str(x) for x in range(0,nB)]
- elif relationtype==2:
- print("\nFor A not in B:")
- A = ["string"+str(x) for x in range(0,nA)]
- B = ["strnng"+str(x) for x in range(0,nB)]
- elif relationtype==3:
- print("\nFor A not in B:")
- A = [0x7EF83945641E0000|x for x in range(0,nA)]
- B = [0x7EF43945641E0000|x for x in range(1,nB+1)]
- random.shuffle(A)
- random.shuffle(B)
- print("trying A {0}, B {1}".format(len(A),len(B)))
- t = timeit.timeit(lambda: set(A).intersection(B), number=100)
- print("set test", t)
- seta, setb = set(A), set(B)
- t = timeit.timeit(lambda: seta&setb, number=1000)
- print("prepared set test", t)
- t = timeit.timeit(lambda: list_intersection(A, B), number=100)
- print("listex test", t)
- sorta, sortb = sorted(A), sorted(B)
- t = timeit.timeit(lambda: list_intersection(sorta, sortb, False, False), number=1000)
- print("prepared listex test", t)
- def consume(it): collections.deque(it, maxlen=0)
- t = timeit.timeit(lambda: consume(iter_intersection(sorta, sortb)), number=1000)
- print("iterable test", t)
- def main():
- """run some performance tests on relative compare
- Note the built in set functions are implemented in C so this isn't really fair
- but seems to be useful for comparing large data sets.
- Naive implementation may be extremely slow so there is an option to skip
- those.
- """
- test(1000,5000,2)
- test(10000,50000,2)
- test(100000,500000,1)
- test(100,500,2)
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement