numpy group_by EP first draft
EelcoHoogendoorn
Jan 21st, 2014
"""
A note on pandas
This module has substantial overlap with pandas' grouping functionality.
So what's the reason for implementing it in numpy?

Primarily: the concept of grouping is far more general than pandas' dataframe.
There is no reason why numpy ndarrays should not have a solid core of grouping functionality.

Secondly, we aim for maximum performance; all ufuncs are supported for reductions
"""


import numpy as np



"""
A note on naming: 'Index' here refers to the fact that the goal of these classes is to
perform and store precomputations on a set of keys,
such as to accelerate subsequent operations involving these keys.

Note that these classes are not primarily intended to be used directly,
but rather are intended for code reuse within a family of related operations,
which are intended to be part of the numpy API.

That said, as the meshing example demonstrates, these classes can also be very useful
in those places where the standard operations do not quite cover your needs,
saving you from completely reinventing the wheel.

All functions which take a 'keys' argument will also accept an already indexed set of keys;
this facilitates reuse of indexes across different operations
"""


"""
class hierarchy for indexing a set of keys
the class hierarchy allows for code reuse, while providing specializations for different
types of key objects, yielding maximum flexibility and performance
"""


class BaseIndex(object):
    """
    minimal indexing functionality
    only provides unique and counts, but with optimal performance
    no grouping, reductions, or lex-keys are supported,
    or anything that would require an indirect sort
    """
    def __init__(self, keys):
        """
        keys is a flat array of possibly composite type
        """
        self.keys   = np.asarray(keys).flatten()
        self.sorted = np.sort(self.keys)
        #the slicing points of the bins to reduce over, as required by ufunc.reduceat
        self.flag   = self.sorted[:-1] != self.sorted[1:]
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def start(self):
        """start index of all bins"""
        return self.slices[:-1]
    @property
    def stop(self):
        """stop index of all bins"""
        return self.slices[1:]
    @property
    def groups(self):
        """number of distinct groups"""
        return len(self.start)
    @property
    def size(self):
        """total number of keys"""
        return self.keys.size


    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return self.sorted[self.start]
    @property
    def count(self):
        """number of times each key occurs"""
        return np.diff(self.slices)
    @property
    def uniform(self):
        """returns true if each key occurs an equal number of times"""
        return not np.any(np.diff(self.count))

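#a minimal usage sketch of BaseIndex (toy keys assumed; not part of the draft proper):
##keys  = np.array([1, 2, 1, 3, 2, 1])
##index = BaseIndex(keys)
##print index.unique       #[1 2 3]
##print index.count        #[3 2 1]
##print index.uniform      #False; the keys do not all occur equally often
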
class Index(BaseIndex):
    """
    index object over a set of keys
    adds support for inversion and reduction
    """
    def __init__(self, keys):
        """
        keys is a flat array of possibly composite type
        """
        self.keys   = np.asarray(keys).flatten()
        #find indices which sort the keys; needed later for values
        self.sorter = np.argsort(self.keys)
        #compute sorted keys
        self.sorted = self.keys[self.sorter]
        #the slicing points of the bins to reduce over, as required by ufunc.reduceat
        self.flag = self.sorted[:-1] != self.sorted[1:]
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def index(self):
        """index of the first occurrence of each unique key; I've never run into any use cases for this, but it is included for backwards compatibility"""
        return self.sorter[self.start]
    @property
    def inverse(self):
        """return index array that maps unique values back to original space"""
        cs = np.cumsum(np.concatenate(([False], self.flag)))
        isorter = np.argsort(self.sorter)
        return cs[isorter]

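#a small sketch of the inverse property (toy keys assumed):
#unique[inverse] reconstructs the original key array, mirroring np.unique(keys, return_inverse=True)
##keys  = np.array(['b', 'a', 'b', 'c'])
##index = Index(keys)
##print index.inverse                  #[1 0 1 2]
##print index.unique[index.inverse]    #['b' 'a' 'b' 'c']; equals keys
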
##    def where(self, other):
##        """
##        determine at which indices a second set equals a first
##        """
##        return np.searchsorted(self.unique, other)



class LexIndex(Index):
    """
    index object based on lexicographic ordering of a tuple of key arrays
    we could also handle it as a struct array?
    """
    def __init__(self, keys):
        self.keys   = tuple(np.asarray(k).flatten() for k in keys)
        #find indices which sort the keys; needed later for values
        self.sorter = np.lexsort(self.keys)
        #compute sorted keys
        self.sorted = tuple(k[self.sorter] for k in self.keys)
        #the slicing points of the bins to reduce over, as required by ufunc.reduceat
        self.flag = reduce(
            np.logical_or,
            (s[:-1] != s[1:] for s in self.sorted))
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return tuple(s[self.start] for s in self.sorted)
    @property
    def size(self):
        return self.sorter.size

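#a sketch of LexIndex on a pair of key arrays (toy coordinates assumed):
#each group is a unique (x, y) combination; as with np.lexsort, the last key is the primary sort key
##x = np.array([0, 0, 1, 1, 0])
##y = np.array([1, 2, 1, 1, 1])
##index = LexIndex((x, y))
##print index.unique       #three unique pairs: (0,1), (1,1), (0,2)
##print index.count        #[2 2 1]
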
def axis_as_object(arr, axis=-1):
    """
    cast the given axis of an array to a void object
    if the axis to be cast is contiguous, a view is returned, otherwise a copy
    this is useful for efficiently sorting by the content of an axis, for instance
    """
    shape = arr.shape
    arr = np.ascontiguousarray(np.rollaxis(arr, axis, -1))
    return arr.view(np.dtype((np.void, arr.dtype.itemsize * shape[axis]))).reshape(np.delete(shape, axis))

def object_as_axis(arr, dtype, axis=-1):
    """
    cast an array of void objects back to a typed axis; the inverse of axis_as_object
    """
    return np.rollaxis(arr.view(dtype).reshape(arr.shape+(-1,)), -1, axis)

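#a quick round-trip sketch (toy array assumed): packing rows into void scalars and back
##a = np.arange(6, dtype=np.int32).reshape(3, 2)
##v = axis_as_object(a)            #shape (3,); each row packed into one void scalar
##b = object_as_axis(v, a.dtype)   #back to shape (3, 2)
##print np.all(a == b)             #True
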
class ObjectIndex(Index):
    """
    the given axis of the keys array is viewed as a void object;
    groups will be formed on the basis of bitwise equality
    """
    def __init__(self, keys, axis=-1):
        self.axis = axis
        self.dtype = keys.dtype
        super(ObjectIndex, self).__init__(axis_as_object(keys, axis))

    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return object_as_axis(self.sorted, self.dtype, self.axis)[self.start]

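#a sketch of ObjectIndex treating rows as keys (toy rows assumed):
##rows  = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.int8)
##index = ObjectIndex(rows)
##print index.unique       #[[0 1] [1 0]]; duplicate rows collapsed
##print index.count        #[2 1]
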
def as_index(keys, base=False):
    """
    standard casting of keys to index
    ObjectIndex is not created by default; only on demand
    """
    if isinstance(keys, Index):
        return keys
    if isinstance(keys, np.ndarray):
        if base:
            return BaseIndex(keys)
        else:
            return Index(keys)

    return LexIndex(keys)

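#a sketch of the dispatch performed by as_index (toy keys assumed):
##as_index(np.array([1, 2, 1]))               #ndarray -> Index
##as_index(np.array([1, 2, 1]), base=True)    #ndarray -> BaseIndex; cheaper, direct sort only
##as_index((np.array([0, 0, 1]), np.array([1, 2, 1])))   #tuple of key arrays -> LexIndex
##as_index(Index(np.array([1, 2, 1])))        #an existing index is passed through unchanged

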
"""
standard API starts here
"""


class GroupBy(object):
    """
    supports ufunc reduction
    should any other form of reduction be supported?
    not sure it should; more cleanly written externally, I think
    """
    def __init__(self, keys):
        #we could inherit from Index, but the multiple specializations make
        #holding a reference to an index object more appropriate
        self.index = as_index(keys)

    #forward interesting index properties
##    @property
##    def keys(self):
##        #this alias makes sense in the context of a group, no?
##        return self.index.unique
    @property
    def unique(self):
        return self.index.unique
    @property
    def count(self):
        return self.index.count
    @property
    def inverse(self):
        return self.index.inverse



    def as_array(self, values):
        """
        return grouped values as an ndarray
        returns an array of shape [groups, groupsize, ungrouped-axes]
        this will only produce correct results if self.uniform==True;
        checking this is left to the user!
        """
        values = values[self.index.sorter]
        return values.reshape(self.index.groups, self.index.count[0], *values.shape[1:])
    def as_list(self, values):
        """return grouped values as a list of arrays, or a jagged-array"""
        values = values[self.index.sorter]
        return np.split(values, self.index.slices[1:-1], axis=0)
    def group(self, values):
        """group values; as an array if the groups are uniform in size, else as a list"""
        return self.as_array(values) if self.index.uniform else self.as_list(values)
    def __call__(self, values):
        #not sure how I feel about this. explicit is better than implicit,
        #but group_by(keys).group(values) does not sit too well with me either
        return self.unique, self.group(values)

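    #a minimal grouping sketch (toy data assumed):
##    keys   = np.array([0, 1, 0, 1, 1])
##    values = np.array([1., 2., 3., 4., 5.])
##    g = GroupBy(keys)
##    print g.unique            #[0 1]
##    print g.as_list(values)   #[array([ 1.,  3.]), array([ 2.,  4.,  5.])]
##    print g(values)           #same grouping, paired with the unique keys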

    def reduce(self, values, operator = np.add):
        """
        reduce the values over identical key groups, using the given ufunc operator
        reduction is over the first axis, which should have elements corresponding to the keys
        all other axes are treated independently for the sake of this reduction
        note that this code is only C-vectorized over the first axis;
        that is fine if this inner loop is significant, but not so much if it isn't
        if we have only few keys, but values[0].size is substantial, a reduction via
        as_list may be preferable
        """
        values = values[self.index.sorter]
        if values.ndim>1:
            func = lambda slc: operator.reduceat(slc, self.index.start)
            return np.apply_along_axis(func, 0, values)
        else:
            return operator.reduceat(values, self.index.start)

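    #a reduction sketch (toy data assumed); any binary, associative ufunc works
##    keys   = np.array([0, 1, 0, 1, 1])
##    values = np.array([1., 2., 3., 4., 5.])
##    g = GroupBy(keys)
##    print g.reduce(values)                #[  4.  11.]; np.add is the default
##    print g.reduce(values, np.multiply)   #[  3.  40.]
##    print g.reduce(values, np.maximum)    #[ 3.  5.]
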
    #a bunch of common reduction operations
    def sum(self, values, axis=0):
        """compute the sum over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values)

    def mean(self, values, axis=0):
        """compute the mean over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.index.count.reshape(-1,*(1,)*(values.ndim-1))
        return self.unique, self.reduce(values) / count

    def var(self, values, axis=0):
        """compute the variance over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.index.count.reshape(-1,*(1,)*(values.ndim-1))
        mean = self.reduce(values) / count
        err = values - mean[self.inverse]
        return self.unique, self.reduce(err**2) / count

    def std(self, values, axis=0):
        """standard deviation over each group"""
        return self.unique, np.sqrt(self.var(values, axis)[1])

    def median(self, values, axis=0, average=True):
        """compute the median value over each group. pretty clever, if I may say so myself"""
        mid_2 = self.index.start + self.index.stop
        hi = (mid_2    ) // 2
        lo = (mid_2 - 1) // 2

        def median1d(slc):
            #place values at correct keys
            slc    = slc[self.index.sorter]
            #refine sorting within each keygroup
            sorter = np.lexsort((slc, self.index.sorted))
            slc    = slc[sorter]
            return (slc[lo]+slc[hi]) / 2 if average else slc[hi]

        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        if values.ndim>1:
            values = np.apply_along_axis(median1d, 0, values)
        else:
            values = median1d(values)
        return self.unique, values

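    #a sketch of the lo/hi midpoint trick above (toy data assumed):
    #for a group spanning [start, stop), lo and hi pick the middle element(s) of the value-sorted
    #group; for odd-sized groups lo==hi, for even-sized groups they straddle the middle
##    keys   = np.array([0, 0, 0, 1, 1])
##    values = np.array([3., 1., 2., 5., 4.])
##    print GroupBy(keys).median(values)[1]   #[ 2.   4.5]
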
    def min(self, values, axis=0):
        """return the minimum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.minimum)

    def max(self, values, axis=0):
        """return the maximum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.maximum)

    def first(self, values, axis=0):
        """return values at the first occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.start]]

    def last(self, values, axis=0):
        """return values at the last occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.stop-1]]



    #implement iter interface? could simply do zip(*group_by(keys)(values)), no?

#just an alias, for those who don't like camelcase
group_by = GroupBy


"""
some demonstration of the overlap between this code and np.unique;
they share a lot of common functionality.
we can't quite bootstrap grouping from unique as-is,
but unique can easily be reimplemented using the Index class
"""

def unique(keys, return_index = False, return_inverse = False, return_count = False):
    """
    equivalent interface to numpy.unique
    I believe the kwargs should be deprecated, though;
    it is cleaner to call index and its properties directly,
    should the standard interface not suffice
    """
    index = as_index(keys, base = not (return_index or return_inverse))

    ret = index.unique,
    if return_index:
        ret = ret + (index.index,)
    if return_inverse:
        ret = ret + (index.inverse,)
    if return_count:
        ret = ret + (index.count,)
    return ret[0] if len(ret) == 1 else ret

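#a sketch mirroring np.unique (toy keys assumed):
##keys = np.array([1, 3, 2, 3, 1])
##print unique(keys)                      #[1 2 3]
##print unique(keys, return_count=True)   #(array([1, 2, 3]), array([2, 1, 2]))
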
def count(keys):
    """numpy equivalent of collections.Counter"""
    index = as_index(keys, base = True)
    return index.unique, index.count

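#a Counter-style sketch (toy keys assumed):
##unique_keys, key_count = count(np.array(list('abracadabra')))
##print unique_keys    #['a' 'b' 'c' 'd' 'r']
##print key_count      #[5 2 1 1 2]
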
def multiplicity(keys):
    """
    return the multiplicity of each key, or how often it occurs in the set
    note that we are not interested in the unique values for this operation,
    casting doubt on the current numpy design, which places np.unique
    at the center of the interface for this kind of operation
    """
    index = as_index(keys)
    return index.count[index.inverse]

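#a multiplicity sketch (toy keys assumed); note the result aligns with the input, not the unique keys
##print multiplicity(np.array([1, 2, 1, 1, 3]))   #[3 1 3 3 1]
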

def incidence(boundary):
    """
    given an NxM matrix containing boundary info between simplices,
    compute an incidence info matrix
    """
    return group_by(boundary).group(np.arange(boundary.size) // boundary.shape[1])

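#a small incidence sketch (toy triangle pair assumed):
#grouping simplex indices by vertex tells us which simplices touch each vertex
##tris = np.array([[0, 1, 2], [1, 2, 3]])
##print incidence(tris)   #[array([0]), array([0, 1]), array([0, 1]), array([1])]
#vertices 1 and 2 are shared by both triangles; vertices 0 and 3 touch only one
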



def test_basic():
    keys   = np.array(["e", "b", "b", "c", "d", "e", "e", 'a'])
    values = np.array([1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1])


##    keys   = np.array(["a", "b", "b", "c", "d", "e", "e", 'd', 'a', 'c'])
##    values = np.array([1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1, 2, 3])

    for k,v in zip(*group_by(keys)(values)):
        print k, v

    g = group_by(keys)

    unique_keys, reduced_values = g.median(values)
    print 'per group median'
    print reduced_values
    unique_keys, reduced_values = g.mean(values)
    print 'per group mean'
    print reduced_values
    unique_keys, reduced_values = g.std(values)
    print 'per group std'
    print reduced_values


    reduced_values = g.reduce(values, np.minimum)
    print 'per group min'
    print reduced_values
    unique_keys, reduced_values = g.max(values)
    print 'per group max'
    print reduced_values

##test_basic()
##quit()

def test_fancy_keys():
    """
    test Index subclasses:
    the normal Index operates on a flat array,
    but we may operate on struct arrays, axes of a given array, or a sequence of keys as well
    """
    keys   = np.random.randint(0, 2, (100,3)).astype(np.int8)
    values = np.random.randint(-1, 2, (100,4))

    struct_keys = np.zeros(len(keys), dtype='3int8, float32')
    struct_keys['f0'] = keys

    #all these various datastructures should produce the same indexing behavior
    assert(np.all(
        multiplicity(ObjectIndex(keys)) ==  #void object indexing
        multiplicity(tuple(keys.T))))       #lexicographic indexing
    assert(np.all(
        multiplicity(ObjectIndex(keys)) ==  #void object indexing
        multiplicity(struct_keys)))         #struct array indexing

    #make our second field relevant
    struct_keys['f1'][0] = 3.1415
    unique_keys, reduced_values = group_by(struct_keys).sum(values)
    print 'sum per group of identical rows'
    for uk in unique_keys: print uk
    print reduced_values

##test_fancy_keys()
##quit()


def test_radial():
    x = np.linspace(-2, 2, 64)
    y = x[:, None]
    x = x[None, :]
    R = np.sqrt(x**2 + y**2)

    def airy(r, sigma):
        from scipy.special import j1
        r = r / sigma * np.sqrt(2)
        a = (2*j1(r)/r)**2
        a[r==0] = 1
        return a
    def gauss(r, sigma):
        return np.exp(-(r/sigma)**2)

    distribution = np.random.choice([gauss, airy])(R, 0.3)
    sample = np.random.poisson(distribution*100+10).astype(np.float)

    import matplotlib.pyplot as pp
    print 'is this an airy or gaussian function? hard to tell with all this noise!'
    pp.imshow(sample, interpolation='nearest')
    pp.show()
    #radial reduction to the rescue!
    #group pixels by their (rounded) radius, and take the mean within each radial bin
    print 'if we are sampling an airy function, you will see a small but significant rise around x=1'
    pp.plot(*group_by(np.round(R, 5)).mean(sample.flatten()))
    pp.xlim(0, 2)
    pp.show()

##test_radial()
##quit()

def test_meshing():
    """
    meshing example
    """
    #set up some random points, and get their delaunay triangulation
    points = np.random.random((20,2))*2-1
    points = points[np.linalg.norm(points, axis=1) < 1]
    from scipy.spatial.qhull import Delaunay
    d = Delaunay(points)
    tris = d.simplices

    #the operations provided in this module allow us to express potentially complex
    #computational geometry questions elegantly in numpy
    #Delaunay.neighbors could be used as well,
    #but the point is to express this in pure numpy, without additional library functionality
    edges = tris[:,[[0,1],[1,2],[2,0]]].reshape(-1,2)
    sorted_edges = np.where(edges[:,0:1]<edges[:,1:2], edges, edges[:,::-1])
    #we test to see how often each edge occurs, or how many incident simplices it has
    #this is a very general method of finding the boundary of any topology,
    #and we can do so here with only one simple and readable command: multiplicity == 1
    index = ObjectIndex(sorted_edges)
    boundary_edges = edges[multiplicity(index)==1]
    boundary_points = unique(boundary_edges)

    if False:
        print boundary_edges
        print incidence(boundary_edges)
        print len(incidence(index))
        inverse = index.inverse
        FE = inverse.reshape(-1,3)
        print FE
        print incidence(FE)


    #create some random values on faces
    #we want to smooth them over the mesh to create a nice hilly landscape
    face_values   = np.random.random(d.nsimplex)
    #add some salt and pepper noise, to make our problem more interesting
    face_values[np.random.randint(d.nsimplex, size=10)] += 1000
    #in order to smooth, we need to map from faces to vertices, and back again;
    #mapping from vertices to faces is simple, the converse not so much.
    #yet it can be done concisely and efficiently using a group reduction operation

    #start with a median step, to remove salt-and-pepper noise
    #toggle to see the effect of the median filter
    g = group_by(tris)
    prestep = g.median if True else g.mean
    _, vertex_values = prestep(np.repeat(face_values, 3))

    #actually, we can compute the mean without grouping
    tris_per_vert = g.count
    def scatter(x):
        r = np.zeros(d.npoints, x.dtype)
        for idx in tris.T: np.add.at(r, idx, x)
        return r / tris_per_vert
    def gather(x):
        return x[tris].mean(axis=1)

    for i in range(100):
        face_values   = gather(vertex_values)
        vertex_values = scatter(face_values)

    #display our nicely rolling hills and their boundary
    import matplotlib.pyplot as plt
    x, y = points.T
    plt.tripcolor(x, y, triangles = tris, facecolors = face_values)
    plt.scatter(x[boundary_points], y[boundary_points])
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.axis('equal')
    plt.show()

test_meshing()