numpy group_by EP first draft
EelcoHoogendoorn
Jan 21st, 2014
"""
A note on pandas
This module has substantial overlap with pandas' grouping functionality.
So what's the reason for implementing it in numpy?

Primarily: the concept of grouping is far more general than pandas' dataframe.
There is no reason why numpy ndarrays should not have a solid core of grouping functionality.

Secondly, we aim for maximum performance; all ufuncs are supported for reductions
"""


import numpy as np



"""
A note on naming: 'Index' here refers to the fact that the goal of these classes is to
perform and store precomputations on a set of keys,
such as to accelerate subsequent operations involving these keys.

Note that these classes are not primarily intended to be used directly,
but rather are intended for code reuse within a family of related operations,
which are intended to be part of the numpy API.

That said, as the meshing example demonstrates, these classes can also be very useful
in those places where the standard operations do not quite cover your needs,
saving you from completely reinventing the wheel.

All functions which take a 'keys' argument will also accept an already indexed set of keys;
this facilitates reuse of indexes across different operations
"""


"""
class hierarchy for indexing a set of keys
the class hierarchy allows for code reuse, while providing specializations for different
types of key objects, yielding maximum flexibility and performance
"""


class BaseIndex(object):
    """
    minimal indexing functionality
    only provides unique and counts, but with optimal performance
    no grouping, reductions, or lex-keys are supported,
    or anything that would require an indirect sort
    """
    def __init__(self, keys):
        """
        keys is a flat array of possibly composite type
        """
        self.keys   = np.asarray(keys).flatten()
        self.sorted = np.sort(self.keys)
        #the slicing points of the bins to reduce over, as required by ufunc.reduceat
        self.flag   = self.sorted[:-1] != self.sorted[1:]
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def start(self):
        """start index of all bins"""
        return self.slices[:-1]
    @property
    def stop(self):
        """stop index of all bins"""
        return self.slices[1:]
    @property
    def groups(self):
        """number of distinct groups"""
        return len(self.start)
    @property
    def size(self):
        """total number of keys"""
        return self.keys.size


    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return self.sorted[self.start]
    @property
    def count(self):
        """number of times each key occurs"""
        return np.diff(self.slices)
    @property
    def uniform(self):
        """returns true if each key occurs an equal number of times"""
        return not np.any(np.diff(self.count))

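#a minimal usage sketch of BaseIndex (toy keys assumed; not part of the draft proper):
##keys  = np.array([1, 2, 1, 3, 2, 1])
##index = BaseIndex(keys)
##print index.unique       #[1 2 3]
##print index.count        #[3 2 1]
##print index.uniform      #False; the keys do not all occur equally often
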
class Index(BaseIndex):
    """
    index object over a set of keys
    adds support for inversion and reduction
    """
    def __init__(self, keys):
        """
        keys is a flat array of possibly composite type
        """
        self.keys   = np.asarray(keys).flatten()
        #find indices which sort the keys; needed later for values
        self.sorter = np.argsort(self.keys)
        #compute sorted keys
        self.sorted = self.keys[self.sorter]
        #the slicing points of the bins to reduce over, as required by ufunc.reduceat
        self.flag = self.sorted[:-1] != self.sorted[1:]
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def index(self):
        """index of the first occurrence of each unique key; I've never run into any use cases for this, but it is included for backwards compatibility"""
        return self.sorter[self.start]
    @property
    def inverse(self):
        """return index array that maps unique values back to original space"""
        cs = np.cumsum(np.concatenate(([False], self.flag)))
        isorter = np.argsort(self.sorter)
        return cs[isorter]

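#a small sketch of the inverse property (toy keys assumed):
#unique[inverse] reconstructs the original key array, mirroring np.unique(keys, return_inverse=True)
##keys  = np.array(['b', 'a', 'b', 'c'])
##index = Index(keys)
##print index.inverse                  #[1 0 1 2]
##print index.unique[index.inverse]    #['b' 'a' 'b' 'c']; equals keys
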
##    def where(self, other):
##        """
##        determine at which indices a second set equals a first
##        """
##        return np.searchsorted(self.unique, other)



class LexIndex(Index):
    """
    index object based on lexicographic ordering of a tuple of key arrays
    we could also handle it as a struct array?
    """
    def __init__(self, keys):
        self.keys   = tuple(np.asarray(k).flatten() for k in keys)
        #find indices which sort the keys; needed later for values
        self.sorter = np.lexsort(self.keys)
        #compute sorted keys
        self.sorted = tuple(k[self.sorter] for k in self.keys)
        #the slicing points of the bins to reduce over, as required by ufunc.reduceat
        self.flag = reduce(
            np.logical_or,
            (s[:-1] != s[1:] for s in self.sorted))
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return tuple(s[self.start] for s in self.sorted)
    @property
    def size(self):
        return self.sorter.size

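#a sketch of LexIndex on a pair of key arrays (toy coordinates assumed):
#each group is a unique (x, y) combination; as with np.lexsort, the last key is the primary sort key
##x = np.array([0, 0, 1, 1, 0])
##y = np.array([1, 2, 1, 1, 1])
##index = LexIndex((x, y))
##print index.unique       #three unique pairs: (0,1), (1,1), (0,2)
##print index.count        #[2 2 1]
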
def axis_as_object(arr, axis=-1):
    """
    cast the given axis of an array to a void object
    if the axis to be cast is contiguous, a view is returned, otherwise a copy
    this is useful for efficiently sorting by the content of an axis, for instance
    """
    shape = arr.shape
    arr = np.ascontiguousarray(np.rollaxis(arr, axis, -1))
    return arr.view(np.dtype((np.void, arr.dtype.itemsize * shape[axis]))).reshape(np.delete(shape, axis))

def object_as_axis(arr, dtype, axis=-1):
    """
    cast an array of void objects back to a typed axis; the inverse of axis_as_object
    """
    return np.rollaxis(arr.view(dtype).reshape(arr.shape+(-1,)), -1, axis)

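#a quick round-trip sketch (toy array assumed): packing rows into void scalars and back
##a = np.arange(6, dtype=np.int32).reshape(3, 2)
##v = axis_as_object(a)            #shape (3,); each row packed into one void scalar
##b = object_as_axis(v, a.dtype)   #back to shape (3, 2)
##print np.all(a == b)             #True
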
class ObjectIndex(Index):
    """
    the given axis of the keys array is viewed as a void object;
    groups will be formed on the basis of bitwise equality
    """
    def __init__(self, keys, axis=-1):
        self.axis = axis
        self.dtype = keys.dtype
        super(ObjectIndex, self).__init__(axis_as_object(keys, axis))

    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return object_as_axis(self.sorted, self.dtype, self.axis)[self.start]

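#a sketch of ObjectIndex treating rows as keys (toy rows assumed):
##rows  = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.int8)
##index = ObjectIndex(rows)
##print index.unique       #[[0 1] [1 0]]; duplicate rows collapsed
##print index.count        #[2 1]
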
def as_index(keys, base=False):
    """
    standard casting of keys to index
    ObjectIndex is not created by default; only on demand
    """
    if isinstance(keys, Index):
        return keys
    if isinstance(keys, np.ndarray):
        if base:
            return BaseIndex(keys)
        else:
            return Index(keys)

    return LexIndex(keys)

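#a sketch of the dispatch performed by as_index (toy keys assumed):
##as_index(np.array([1, 2, 1]))               #ndarray -> Index
##as_index(np.array([1, 2, 1]), base=True)    #ndarray -> BaseIndex; cheaper, direct sort only
##as_index((np.array([0, 0, 1]), np.array([1, 2, 1])))   #tuple of key arrays -> LexIndex
##as_index(Index(np.array([1, 2, 1])))        #an existing index is passed through unchanged

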
"""
standard API starts here
"""


class GroupBy(object):
    """
    supports ufunc reduction
    should any other form of reduction be supported?
    not sure it should; more cleanly written externally, I think
    """
    def __init__(self, keys):
        #we could inherit from Index, but the multiple specializations make
        #holding a reference to an index object more appropriate
        self.index = as_index(keys)

    #forward interesting index properties
##    @property
##    def keys(self):
##        #this alias makes sense in the context of a group, no?
##        return self.index.unique
    @property
    def unique(self):
        return self.index.unique
    @property
    def count(self):
        return self.index.count
    @property
    def inverse(self):
        return self.index.inverse



    def as_array(self, values):
        """
        return grouped values as an ndarray
        returns an array of shape [groups, groupsize, ungrouped-axes]
        this will only produce correct results if self.uniform==True;
        checking this is left to the user!
        """
        values = values[self.index.sorter]
        return values.reshape(self.index.groups, self.index.count[0], *values.shape[1:])
    def as_list(self, values):
        """return grouped values as a list of arrays, or a jagged-array"""
        values = values[self.index.sorter]
        return np.split(values, self.index.slices[1:-1], axis=0)
    def group(self, values):
        """group values; as an array if the groups are uniform in size, else as a list"""
        return self.as_array(values) if self.index.uniform else self.as_list(values)
    def __call__(self, values):
        #not sure how I feel about this. explicit is better than implicit,
        #but group_by(keys).group(values) does not sit too well with me either
        return self.unique, self.group(values)

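    #a minimal grouping sketch (toy data assumed):
##    keys   = np.array([0, 1, 0, 1, 1])
##    values = np.array([1., 2., 3., 4., 5.])
##    g = GroupBy(keys)
##    print g.unique            #[0 1]
##    print g.as_list(values)   #[array([ 1.,  3.]), array([ 2.,  4.,  5.])]
##    print g(values)           #same grouping, paired with the unique keys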

    def reduce(self, values, operator = np.add):
        """
        reduce the values over identical key groups, using the given ufunc operator
        reduction is over the first axis, which should have elements corresponding to the keys
        all other axes are treated independently for the sake of this reduction
        note that this code is only C-vectorized over the first axis;
        that is fine if this inner loop is significant, but not so much if it isn't
        if we have only few keys, but values[0].size is substantial, a reduction via
        as_list may be preferable
        """
        values = values[self.index.sorter]
        if values.ndim>1:
            func = lambda slc: operator.reduceat(slc, self.index.start)
            return np.apply_along_axis(func, 0, values)
        else:
            return operator.reduceat(values, self.index.start)

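    #a reduction sketch (toy data assumed); any binary, associative ufunc works
##    keys   = np.array([0, 1, 0, 1, 1])
##    values = np.array([1., 2., 3., 4., 5.])
##    g = GroupBy(keys)
##    print g.reduce(values)                #[  4.  11.]; np.add is the default
##    print g.reduce(values, np.multiply)   #[  3.  40.]
##    print g.reduce(values, np.maximum)    #[ 3.  5.]
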
    #a bunch of common reduction operations
    def sum(self, values, axis=0):
        """compute the sum over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values)

    def mean(self, values, axis=0):
        """compute the mean over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.index.count.reshape(-1,*(1,)*(values.ndim-1))
        return self.unique, self.reduce(values) / count

    def var(self, values, axis=0):
        """compute the variance over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.index.count.reshape(-1,*(1,)*(values.ndim-1))
        mean = self.reduce(values) / count
        err = values - mean[self.inverse]
        return self.unique, self.reduce(err**2) / count

    def std(self, values, axis=0):
        """standard deviation over each group"""
        return self.unique, np.sqrt(self.var(values, axis)[1])

    def median(self, values, axis=0, average=True):
        """compute the median value over each group. pretty clever, if I may say so myself"""
        mid_2 = self.index.start + self.index.stop
        hi = (mid_2    ) // 2
        lo = (mid_2 - 1) // 2

        def median1d(slc):
            #place values at correct keys
            slc    = slc[self.index.sorter]
            #refine sorting within each keygroup
            sorter = np.lexsort((slc, self.index.sorted))
            slc    = slc[sorter]
            return (slc[lo]+slc[hi]) / 2 if average else slc[hi]

        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        if values.ndim>1:
            values = np.apply_along_axis(median1d, 0, values)
        else:
            values = median1d(values)
        return self.unique, values

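    #a sketch of the lo/hi midpoint trick above (toy data assumed):
    #for a group spanning [start, stop), lo and hi pick the middle element(s) of the value-sorted
    #group; for odd-sized groups lo==hi, for even-sized groups they straddle the middle
##    keys   = np.array([0, 0, 0, 1, 1])
##    values = np.array([3., 1., 2., 5., 4.])
##    print GroupBy(keys).median(values)[1]   #[ 2.   4.5]
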
    def min(self, values, axis=0):
        """return the minimum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.minimum)

    def max(self, values, axis=0):
        """return the maximum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.maximum)

    def first(self, values, axis=0):
        """return values at the first occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.start]]

    def last(self, values, axis=0):
        """return values at the last occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.stop-1]]



    #implement iter interface? could simply do zip(*group_by(keys)(values)), no?

#just an alias, for those who don't like camelcase
group_by = GroupBy


"""
some demonstration of the overlap between this code and np.unique;
they share a lot of common functionality.
we can't quite bootstrap grouping from unique as-is,
but unique can easily be reimplemented using the Index class
"""

def unique(keys, return_index = False, return_inverse = False, return_count = False):
    """
    equivalent interface to numpy.unique
    I believe the kwargs should be deprecated, though;
    it is cleaner to call index and its properties directly,
    should the standard interface not suffice
    """
    index = as_index(keys, base = not (return_index or return_inverse))

    ret = index.unique,
    if return_index:
        ret = ret + (index.index,)
    if return_inverse:
        ret = ret + (index.inverse,)
    if return_count:
        ret = ret + (index.count,)
    return ret[0] if len(ret) == 1 else ret

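#a sketch mirroring np.unique (toy keys assumed):
##keys = np.array([1, 3, 2, 3, 1])
##print unique(keys)                      #[1 2 3]
##print unique(keys, return_count=True)   #(array([1, 2, 3]), array([2, 1, 2]))
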
def count(keys):
    """numpy equivalent of collections.Counter"""
    index = as_index(keys, base = True)
    return index.unique, index.count

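#a Counter-style sketch (toy keys assumed):
##unique_keys, key_count = count(np.array(list('abracadabra')))
##print unique_keys    #['a' 'b' 'c' 'd' 'r']
##print key_count      #[5 2 1 1 2]
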
def multiplicity(keys):
    """
    return the multiplicity of each key, or how often it occurs in the set
    note that we are not interested in the unique values for this operation,
    casting doubt on the current numpy design, which places np.unique
    at the center of the interface for this kind of operation
    """
    index = as_index(keys)
    return index.count[index.inverse]

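#a multiplicity sketch (toy keys assumed); note the result aligns with the input, not the unique keys
##print multiplicity(np.array([1, 2, 1, 1, 3]))   #[3 1 3 3 1]
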

def incidence(boundary):
    """
    given an NxM matrix containing boundary info between simplices,
    compute an incidence info matrix
    """
    return group_by(boundary).group(np.arange(boundary.size) // boundary.shape[1])

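#a small incidence sketch (toy triangle pair assumed):
#grouping simplex indices by vertex tells us which simplices touch each vertex
##tris = np.array([[0, 1, 2], [1, 2, 3]])
##print incidence(tris)   #[array([0]), array([0, 1]), array([0, 1]), array([1])]
#vertices 1 and 2 are shared by both triangles; vertices 0 and 3 touch only one
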



def test_basic():
    keys   = np.array(["e", "b", "b", "c", "d", "e", "e", 'a'])
    values = np.array([1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1])


##    keys   = np.array(["a", "b", "b", "c", "d", "e", "e", 'd', 'a', 'c'])
##    values = np.array([1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1, 2, 3])

    for k,v in zip(*group_by(keys)(values)):
        print k, v

    g = group_by(keys)

    unique_keys, reduced_values = g.median(values)
    print 'per group median'
    print reduced_values
    unique_keys, reduced_values = g.mean(values)
    print 'per group mean'
    print reduced_values
    unique_keys, reduced_values = g.std(values)
    print 'per group std'
    print reduced_values


    reduced_values = g.reduce(values, np.minimum)
    print 'per group min'
    print reduced_values
    unique_keys, reduced_values = g.max(values)
    print 'per group max'
    print reduced_values

##test_basic()
##quit()

def test_fancy_keys():
    """
    test Index subclasses:
    the normal Index operates on a flat array,
    but we may operate on struct arrays, axes of a given array, or a sequence of keys as well
    """
    keys   = np.random.randint(0, 2, (100,3)).astype(np.int8)
    values = np.random.randint(-1, 2, (100,4))

    struct_keys = np.zeros(len(keys), dtype='3int8, float32')
    struct_keys['f0'] = keys

    #all these various datastructures should produce the same indexing behavior
    assert(np.all(
        multiplicity(ObjectIndex(keys)) ==  #void object indexing
        multiplicity(tuple(keys.T))))       #lexicographic indexing
    assert(np.all(
        multiplicity(ObjectIndex(keys)) ==  #void object indexing
        multiplicity(struct_keys)))         #struct array indexing

    #make our second field relevant
    struct_keys['f1'][0] = 3.1415
    unique_keys, reduced_values = group_by(struct_keys).sum(values)
    print 'sum per group of identical rows'
    for uk in unique_keys: print uk
    print reduced_values

##test_fancy_keys()
##quit()


def test_radial():
    x = np.linspace(-2, 2, 64)
    y = x[:, None]
    x = x[None, :]
    R = np.sqrt(x**2 + y**2)

    def airy(r, sigma):
        from scipy.special import j1
        r = r / sigma * np.sqrt(2)
        a = (2*j1(r)/r)**2
        a[r==0] = 1
        return a
    def gauss(r, sigma):
        return np.exp(-(r/sigma)**2)

    distribution = np.random.choice([gauss, airy])(R, 0.3)
    sample = np.random.poisson(distribution*100+10).astype(np.float)

    import matplotlib.pyplot as pp
    print 'is this an airy or gaussian function? hard to tell with all this noise!'
    pp.imshow(sample, interpolation='nearest')
    pp.show()
    #radial reduction to the rescue!
    #group pixels by their (rounded) radius, and take the mean within each radial bin
    print 'if we are sampling an airy function, you will see a small but significant rise around x=1'
    pp.plot(*group_by(np.round(R, 5)).mean(sample.flatten()))
    pp.xlim(0, 2)
    pp.show()

##test_radial()
##quit()

def test_meshing():
    """
    meshing example
    """
    #set up some random points, and get their delaunay triangulation
    points = np.random.random((20,2))*2-1
    points = points[np.linalg.norm(points, axis=1) < 1]
    from scipy.spatial.qhull import Delaunay
    d = Delaunay(points)
    tris = d.simplices

    #the operations provided in this module allow us to express potentially complex
    #computational geometry questions elegantly in numpy
    #Delaunay.neighbors could be used as well,
    #but the point is to express this in pure numpy, without additional library functionality
    edges = tris[:,[[0,1],[1,2],[2,0]]].reshape(-1,2)
    sorted_edges = np.where(edges[:,0:1]<edges[:,1:2], edges, edges[:,::-1])
    #we test to see how often each edge occurs, or how many incident simplices it has
    #this is a very general method of finding the boundary of any topology,
    #and we can do so here with only one simple and readable command: multiplicity == 1
    index = ObjectIndex(sorted_edges)
    boundary_edges = edges[multiplicity(index)==1]
    boundary_points = unique(boundary_edges)

    if False:
        print boundary_edges
        print incidence(boundary_edges)
        print len(incidence(index))
        inverse = index.inverse
        FE = inverse.reshape(-1,3)
        print FE
        print incidence(FE)


    #create some random values on faces
    #we want to smooth them over the mesh to create a nice hilly landscape
    face_values   = np.random.random(d.nsimplex)
    #add some salt and pepper noise, to make our problem more interesting
    face_values[np.random.randint(d.nsimplex, size=10)] += 1000
    #in order to smooth, we need to map from faces to vertices, and back again;
    #mapping from vertices to faces is simple, the converse not so much.
    #yet it can be done concisely and efficiently using a group reduction operation

    #start with a median step, to remove salt-and-pepper noise
    #toggle to see the effect of the median filter
    g = group_by(tris)
    prestep = g.median if True else g.mean
    _, vertex_values = prestep(np.repeat(face_values, 3))

    #actually, we can compute the mean without grouping
    tris_per_vert = g.count
    def scatter(x):
        r = np.zeros(d.npoints, x.dtype)
        for idx in tris.T: np.add.at(r, idx, x)
        return r / tris_per_vert
    def gather(x):
        return x[tris].mean(axis=1)

    for i in range(100):
        face_values   = gather(vertex_values)
        vertex_values = scatter(face_values)

    #display our nicely rolling hills and their boundary
    import matplotlib.pyplot as plt
    x, y = points.T
    plt.tripcolor(x, y, triangles = tris, facecolors = face_values)
    plt.scatter(x[boundary_points], y[boundary_points])
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.axis('equal')
    plt.show()

test_meshing()