listen.py

#-------------------------------------------------------------------------------
# Name:        listex
# Purpose: list data type helper functions that operate like relational
#   algebra expressions such as union, intersection, etc. these are List to
#   List functions e.g. [1,2,3] insersection [0,2,4] -> [2]
#   Each function takes at least one list as argument(la) and a sort parameter
#    (sa). If the list is already sorted (all of these functions result in a
#   sorted list), about 1/2 processing time can be saved for subsequent
#   functions. The result is always a list.
#-------------------------------------------------------------------------------

def iter_intersection(a, b, key=None):
    """intersect(a, b) -> elements from a that are also in b

    Note that both iterables should be sorted (with the same key function
    passed here), for the same reason as with groupby.
    """
    a, b = iter(a), iter(b)
    try:
        nexta = next(a)
        keya = key(nexta) if key else nexta
        nextb = next(b)
        keyb = key(nextb) if key else nextb
        while True:
            if keya == keyb:
                yield nexta
                nexta = next(a)
                keya = key(nexta) if key else nexta
                # NOTE: If you wanted to add flags for multiset
                # intersection or pure set intersection, this is
                # where you'd add the change.
            elif keya < keyb:
                nexta = next(a)
                keya = key(nexta) if key else nexta
            else:
                nextb = next(b)
                keyb = key(nextb) if key else nextb
    except StopIteration:
        return

#intersection
# in A and B
def list_intersection(la, lb, sa=True,sb=True):
    """return list of values in both A and B
    """
    # return list of values in both lists
    if sa:
        la = sorted(la)
    if sb:
        lb = sorted(lb)

    lena = len(la)
    lenb = len(lb)
    ia = 0
    ib = 0

    res = []
    while(ia<lena and ib< lenb):
        #print("{} == {}".format(la[ia],lb[ib]))
        if (la[ia] < lb[ib]):
            ia += 1
        elif (la[ia] > lb[ib]):
            ib += 1
        else:
            res.append(la[ia])
            ia += 1
            ib += 1

    return res

#symetric difference
# in A XOR B
def list_symdif(la, lb, sa=True,sb=True):
   """return list of values in A XOR B
   """
   # return list of values in A XOR B
   if sa:
     la = sorted(la)
   if sb:
     lb = sorted(lb)

   lena = len(la)
   lenb = len(lb)
   ia = 0
   ib = 0
   res = []

   #head
   if (la[0] < lb[0]):
     while(ia<lena and la[ia] < lb[0]):
         #print("{} == {}".format(la[ia],lb[ib]))
         res.append(la[ia])
         ia += 1
   elif (la[0] > lb[0]):
     while(ib < lenb and lb[ib] < la[0]):
         #print("{} == {}".format(la[ia],lb[ib]))
         res.append(lb[ib])
         ib += 1

   #body
   while(ia<lena and ib< lenb):
     #print("{} == {}".format(la[ia],lb[ib]))
     if (la[ia] < lb[ib]):
         res.append(la[ia])
         ia += 1
     elif (la[ia] > lb[ib]):
         res.append(lb[ib])
         ib += 1
     else:
         ia += 1
         ib += 1

   #tail
   if (ia==lena):
     for j in range(ib,lenb):
         res.append(lb[j])
   elif (ib==lenb):
     for j in range(ia,lena):
         res.append(la[j])

   return res

# UNION
# in A OR B
def list_union(la, lb, sa=True,sb=True):
    """return list of values in A OR B
    """
    # return list of values in A OR B
    if sa:
        la = sorted(la)
    if sb:
        lb = sorted(lb)

    lena = len(la)
    lenb = len(lb)
    ia = 0
    ib = 0
    res = []

    #head
    if lena>0 and lenb>0:
        if (la[0] < lb[0]):
            while(ia<lena and la[ia] < lb[0]):
                #print("{} == {}".format(la[ia],lb[ib]))
                res.append(la[ia])
                ia += 1
        elif (la[0] > lb[0]):
            while(ib < lenb and lb[ib] < la[0]):
                #print("{} == {}".format(la[ia],lb[ib]))
                res.append(lb[ib])
                ib += 1

    #body
    while(ia<lena and ib< lenb):
        #print("{} == {}".format(la[ia],lb[ib]))
        if (la[ia] < lb[ib]):
            res.append(la[ia])
            ia += 1
        elif (la[ia] > lb[ib]):
            res.append(lb[ib])
            ib += 1
        else:
            res.append(la[ia])
            ia += 1
            ib += 1

    #tail
    if (ia==lena):
        for j in range(ib,lenb):
            res.append(lb[j])
    elif (ib==lenb):
        for j in range(ia,lena):
            res.append(la[j])

    return res
#end union()


#relative complement
# in A and not B
def list_relcomp(la, lb, sa=True,sb=True):
    """return list of values in A and not B
    """
    # return list of values in A but not B
    if sa:
        la = sorted(la)
    if sb:
        lb = sorted(lb)

    lena = len(la)
    lenb = len(lb)
    ia = 0
    ib = 0

    res = []
    while(ia<lena and ib< lenb):
        #print("{} == {}".format(la[ia],lb[ib]))
        if (la[ia] < lb[ib]):
            res.append(la[ia])
            ia += 1
        elif (la[ia] > lb[ib]):
            ib += 1
        else:
            ia += 1
            ib += 1
    #tail
    if (ib==lenb):
        for j in range(ia,lena):
            res.append(la[j])

    return res


def list_unique(la,sa=True):
    """return list of unique values in A
      sa - sort values in la first
    """
    if sa:
        la = sorted(la)

    lena = len(la)
    ia = 0
    prev = la[ia]
    res = []
    res.append(prev)

    while(ia<lena):
        if (la[ia] > prev):
            res.append(la[ia])
            prev = la[ia]
            ia += 1
        else:
            ia += 1

    return res


#-------------------------------------------------------------------------------
#       Performance Test Section
#-------------------------------------------------------------------------------
def niave_relcomp(a,b):

    res = [x for x in a if x not in b]
    return res


def relcomp_set(a, b):
    aset = a
    bset = b

    res = aset - bset
    res= list(res)
    return res


def test(nA, nB, relationtype):
    """time test functions.
        relationtype 1=B subset of A, 2=A not in B
    """
    import collections
    import random
    import timeit

    if relationtype==1:
        print("\nFor A subset of B:")
        A = ["string"+str(x) for x in range(0,nA)]
        B = ["string"+str(x) for x in range(0,nB)]
    elif relationtype==2:
        print("\nFor A not in B:")
        A = ["string"+str(x) for x in range(0,nA)]
        B = ["strnng"+str(x) for x in range(0,nB)]
    elif relationtype==3:
        print("\nFor A not in B:")
        A = [0x7EF83945641E0000|x for x in range(0,nA)]
        B = [0x7EF43945641E0000|x for x in range(1,nB+1)]

    random.shuffle(A)
    random.shuffle(B)

    print("trying A {0}, B {1}".format(len(A),len(B)))

    t = timeit.timeit(lambda: set(A).intersection(B), number=100)
    print("set test", t)

    seta, setb = set(A), set(B)
    t = timeit.timeit(lambda: seta&setb, number=1000)
    print("prepared set test", t)

    t = timeit.timeit(lambda: list_intersection(A, B), number=100)
    print("listex test", t)

    sorta, sortb = sorted(A), sorted(B)
    t = timeit.timeit(lambda: list_intersection(sorta, sortb, False, False), number=1000)
    print("prepared listex test", t)

    def consume(it): collections.deque(it, maxlen=0)
    t = timeit.timeit(lambda: consume(iter_intersection(sorta, sortb)), number=1000)
    print("iterable test", t)


def main():
    """run some performance tests on relative compare
    Note the built in set functions are implemented in C so this isn't really fair
    but seems to be useful for comparing large data sets.
    Naive implementation may be extremely slow so there is an option to skip
    those.
    """

    test(1000,5000,2)

    test(10000,50000,2)

    test(100000,500000,1)

    test(100,500,2)


if __name__ == '__main__':
    main()