Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- A note on pandas
- This module has substantial overlap with pandas' grouping functionality.
- So whats the reason for implementing it in numpy?
- Primarily; the concept of grouping is far more general than pandas' dataframe.
- There is no reason why numpy ndarrays should not have a solid core of grouping functionality.
- Secondly, we aim for maximum performance; all ufuncs are supported for reductions
- """
from functools import reduce

import numpy as np
- """
- A note on naming: 'Index' here refers to the fact that the goal of these classes is to
- perform and store precomputations on a set of keys,
- such as to accelerate subsequent operations involving these keys.
- Note that these classes are not primarily intended to be used directly,
- but rather are intended for code reuse within a family of related operations,
- which are intended to be part of the numpy API.
- That said, as the meshing example demonstrates these classes can also be very useful
- in those places where the standard operations do not quite cover your needs,
- saving your from completely reinventing the wheel.
- All fuctions which take a 'keys' argument will take an already indexed set of keys as instead as well
- This facilitates reuse of indexes across different operations
- """
- """
- class hierarchy for indexing a set of keys
- the class hierarchy allows for code reuse, while providing specializations for different
- types of key objects, yielding maximum flexibility and performance
- """
- class BaseIndex(object):
- """
- minimal indexing functionality
- only provides unique and counts, but with optimal performance
- no grouping, reductions, or lex-keys are supported,
- or anything that would require an indirect sort
- """
- def __init__(self, keys):
- """
- keys is a flat array of possibly compsite type
- """
- self.keys = np.asarray(keys).flatten()
- self.sorted = np.sort(self.keys)
- #the slicing points of the bins to reduce over, as required by ufunc.reduceat
- self.flag = self.sorted[:-1] != self.sorted[1:]
- self.slices = np.concatenate((
- [0],
- np.flatnonzero(self.flag)+1,
- [self.size]))
- @property
- def start(self):
- """start index of all bins"""
- return self.slices[:-1]
- @property
- def stop(self):
- """stop index of all bins"""
- return self.slices[1:]
- @property
- def groups(self):
- return len(self.start)
- @property
- def size(self):
- return self.keys.size
- @property
- def unique(self):
- """the first entry of each bin is a unique key"""
- return self.sorted[self.start]
- @property
- def count(self):
- """number of times each key occurs"""
- return np.diff(self.slices)
- @property
- def uniform(self):
- """returns true if each key occurs an equal number of times"""
- return not np.any(np.diff(self.count))
class Index(BaseIndex):
    """
    Index object over a set of keys.

    Extends BaseIndex with support for inversion and reduction,
    at the cost of an indirect (arg)sort of the keys.
    """
    def __init__(self, keys):
        """keys is a flat array of possibly composite type"""
        self.keys = np.asarray(keys).flatten()
        # indices which sort the keys; needed later to permute values alike
        self.sorter = np.argsort(self.keys)
        self.sorted = self.keys[self.sorter]
        # bin boundaries between distinct sorted keys, as required by ufunc.reduceat
        self.flag = self.sorted[:-1] != self.sorted[1:]
        self.slices = np.concatenate(
            ([0], np.flatnonzero(self.flag) + 1, [self.size]))

    @property
    def index(self):
        """index into the original array of the first occurrence of each unique key"""
        return self.sorter[self.start]

    @property
    def inverse(self):
        """index array that maps the unique values back to the original key order"""
        group_ids = np.cumsum(np.concatenate(([False], self.flag)))
        return group_ids[np.argsort(self.sorter)]
class LexIndex(Index):
    """
    Index object based on lexicographic ordering of a tuple of key arrays.

    Note: `reduce` must be imported from functools; relying on the
    Python 2 builtin raises NameError on Python 3.
    """
    def __init__(self, keys):
        """keys is a sequence of arrays, each flattened to 1d"""
        self.keys = tuple(np.asarray(k).flatten() for k in keys)
        # indices which lexicographically sort the keys; needed later for values
        self.sorter = np.lexsort(self.keys)
        self.sorted = tuple(k[self.sorter] for k in self.keys)
        # a bin boundary occurs wherever ANY of the key arrays changes value,
        # as required by ufunc.reduceat
        self.flag = reduce(
            np.logical_or,
            (s[:-1] != s[1:] for s in self.sorted))
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag) + 1,
            [self.size]))

    @property
    def unique(self):
        """the first entry of each bin is a unique key; returned as a tuple of arrays"""
        return tuple(s[self.start] for s in self.sorted)

    @property
    def size(self):
        """total number of keys"""
        return self.sorter.size
def axis_as_object(arr, axis=-1):
    """
    Cast the given axis of an array to a void object.

    If the axis to be cast is contiguous, a view is returned, otherwise a copy.
    This is useful for efficiently sorting by the content of an axis, for instance.
    """
    shape = arr.shape
    rolled = np.ascontiguousarray(np.rollaxis(arr, axis, -1))
    # one void element spans the full byte width of the cast axis
    void_dtype = np.dtype((np.void, rolled.dtype.itemsize * shape[axis]))
    return rolled.view(void_dtype).reshape(np.delete(shape, axis))
def object_as_axis(arr, dtype, axis=-1):
    """
    Cast an array of void objects back to a typed axis.

    The typed data reappears as the given axis; inverse of axis_as_object.
    """
    typed = arr.view(dtype).reshape(arr.shape + (-1,))
    return np.rollaxis(typed, -1, axis)
class ObjectIndex(Index):
    """
    Index over a given axis of a keys array, viewed as a void object.

    Groups are formed on the basis of bitwise equality along that axis.
    """
    def __init__(self, keys, axis=-1):
        self.axis = axis
        self.dtype = keys.dtype
        super(ObjectIndex, self).__init__(axis_as_object(keys, axis))

    @property
    def unique(self):
        """the first entry of each bin is a unique key, cast back to its original type"""
        typed = object_as_axis(self.sorted, self.dtype, self.axis)
        return typed[self.start]
def as_index(keys, base=False):
    """
    Standard casting of keys to an index object.

    Existing Index instances pass through untouched, ndarrays are wrapped
    in a (Base)Index, and anything else is assumed to be a sequence of key
    arrays and wrapped in a LexIndex.
    ObjectIndex is not created by default; only on demand.
    """
    if isinstance(keys, Index):
        return keys
    if not isinstance(keys, np.ndarray):
        return LexIndex(keys)
    return BaseIndex(keys) if base else Index(keys)
- """
- standard API starts here
- """
class GroupBy(object):
    """
    Grouping of values by identical keys, with support for ufunc reductions.

    Holds a reference to an index object rather than inheriting from Index,
    since the multiple Index specializations make composition more appropriate.
    Should any other form of reduction be supported? Probably not; that is
    more cleanly written externally.
    """
    def __init__(self, keys):
        # keys may be a raw key array or an already-constructed Index
        self.index = as_index(keys)

    # forward interesting index properties
    @property
    def unique(self):
        """the unique keys, one per group"""
        return self.index.unique

    @property
    def count(self):
        """number of entries in each group"""
        return self.index.count

    @property
    def inverse(self):
        """index array mapping unique values back to the original key order"""
        return self.index.inverse

    def as_array(self, values):
        """
        Return grouped values as an ndarray of shape
        [groups, groupsize, ungrouped-axes].
        This only produces correct results if self.index.uniform is True;
        checking this is left to the user!
        """
        values = values[self.index.sorter]
        return values.reshape(self.index.groups, self.index.count[0], *values.shape[1:])

    def as_list(self, values):
        """return grouped values as a list of arrays, i.e. a jagged array"""
        values = values[self.index.sorter]
        return np.split(values, self.index.slices[1:-1], axis=0)

    def group(self, values):
        """group values; a single ndarray if groups are uniform, a list otherwise"""
        return self.as_array(values) if self.index.uniform else self.as_list(values)

    def __call__(self, values):
        """convenience interface; returns (unique_keys, grouped_values)"""
        return self.unique, self.group(values)

    def reduce(self, values, operator=np.add):
        """
        Reduce the values over identical key groups, using the given ufunc.

        Reduction is over the first axis, which should have elements
        corresponding to the keys; all other axes are treated independently.
        Note that this code is only C-vectorized over the first axis;
        that is fine if this inner loop is significant, but if we have only
        few keys while values[0].size is substantial, a reduction via
        as_list may be preferable.
        """
        values = values[self.index.sorter]
        if values.ndim > 1:
            return np.apply_along_axis(
                lambda slc: operator.reduceat(slc, self.index.start), 0, values)
        return operator.reduceat(values, self.index.start)

    # a bunch of common reduction operations
    def sum(self, values, axis=0):
        """compute the sum over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values)

    def mean(self, values, axis=0):
        """compute the mean over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.index.count.reshape(-1, *(1,) * (values.ndim - 1))
        return self.unique, self.reduce(values) / count

    def var(self, values, axis=0):
        """compute the variance over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.index.count.reshape(-1, *(1,) * (values.ndim - 1))
        mean = self.reduce(values) / count
        err = values - mean[self.inverse]
        return self.unique, self.reduce(err ** 2) / count

    def std(self, values, axis=0):
        """compute the standard deviation over each group"""
        # BUGFIX: previously returned the np.unique *function* instead of
        # the unique keys as the first tuple element
        unique, variance = self.var(values, axis)
        return unique, np.sqrt(variance)

    def median(self, values, axis=0, average=True):
        """compute the median value over each group. pretty clever, if i may say so myself"""
        # indices of the (upper and lower) middle element of each group
        mid_2 = self.index.start + self.index.stop
        hi = (mid_2) // 2
        lo = (mid_2 - 1) // 2

        def median1d(slc):
            # place values at correct keys
            slc = slc[self.index.sorter]
            # refine sorting within each keygroup
            sorter = np.lexsort((slc, self.index.sorted))
            slc = slc[sorter]
            return (slc[lo] + slc[hi]) / 2 if average else slc[hi]

        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        if values.ndim > 1:
            values = np.apply_along_axis(median1d, 0, values)
        else:
            values = median1d(values)
        return self.unique, values

    def min(self, values, axis=0):
        """return the minimum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.minimum)

    def max(self, values, axis=0):
        """return the maximum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.maximum)

    def first(self, values, axis=0):
        """return values at the first occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.start]]

    def last(self, values, axis=0):
        """return values at the last occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.stop - 1]]


#just an alias, for those who dont like camelcase
group_by = GroupBy
- """
- some demonstration of the duplicity between this code and np.unique
- they share a lot of common functionality
- we cant quite bootstrap grouping from unique as is
- but unique can easily be reimplemented using the Index class
- """
def unique(keys, return_index=False, return_inverse=False, return_count=False):
    """
    Equivalent interface to numpy.unique, built on the Index class.

    Arguably the kwargs should be deprecated: it is cleaner to build
    an index and query its properties directly, should the standard
    interface not suffice.
    """
    # a BaseIndex suffices unless an indirect sort is needed
    index = as_index(keys, base=not (return_index or return_inverse))
    ret = (index.unique,)
    if return_index:
        ret += (index.index,)
    if return_inverse:
        ret += (index.inverse,)
    if return_count:
        ret += (index.count,)
    return ret[0] if len(ret) == 1 else ret
def count(keys):
    """numpy equivalent of collections.Counter; returns (unique_keys, counts)"""
    idx = as_index(keys, base=True)
    return idx.unique, idx.count
def multiplicity(keys):
    """
    Return the multiplicity of each key: how often it occurs in the set.

    Note that we are not interested in the unique values themselves for
    this operation, casting doubt on the current numpy design which places
    np.unique at the center of the interface for this kind of operation.
    """
    idx = as_index(keys)
    return idx.count[idx.inverse]
def incidence(boundary):
    """
    Given an NxM matrix containing boundary info between simplices,
    compute the incidence info: for each unique boundary element,
    the simplices (rows) it is incident to.
    """
    # the row (simplex) index of each flattened boundary entry
    simplex_index = np.arange(boundary.size) // boundary.shape[1]
    return group_by(boundary).group(simplex_index)
def test_basic():
    """demonstrate grouping and reduction on a flat array of string keys"""
    keys = np.array(["e", "b", "b", "c", "d", "e", "e", 'a'])
    values = np.array([1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1])

    # print statements converted to the print function for Python 3 compatibility
    for k, v in zip(*group_by(keys)(values)):
        print(k, v)

    g = group_by(keys)
    unique_keys, reduced_values = g.median(values)
    print('per group median')
    print(reduced_values)
    unique_keys, reduced_values = g.mean(values)
    print('per group mean')
    print(reduced_values)
    unique_keys, reduced_values = g.std(values)
    print('per group std')
    print(reduced_values)
    reduced_values = g.reduce(values, np.minimum)
    print('per group min')
    print(reduced_values)
    unique_keys, reduced_values = g.max(values)
    print('per group max')
    print(reduced_values)
- ##test_basic()
- ##quit()
def test_fancy_keys():
    """
    Test the Index subclasses.

    The normal Index operates on a flat array, but we may operate on
    struct arrays, axes of a given array, or a sequence of keys as well.
    """
    keys = np.random.randint(0, 2, (100, 3)).astype(np.int8)
    values = np.random.randint(-1, 2, (100, 4))

    struct_keys = np.zeros(len(keys), dtype='3int8, float32')
    struct_keys['f0'] = keys

    # all these various datastructures should produce the same indexing behavior
    assert np.all(
        multiplicity(ObjectIndex(keys)) ==      # void object indexing
        multiplicity(tuple(keys.T)))            # lexicographic indexing
    assert np.all(
        multiplicity(ObjectIndex(keys)) ==      # void object indexing
        multiplicity(struct_keys))              # struct array indexing

    # make our second field relevant
    struct_keys['f1'][0] = 3.1415
    unique_keys, reduced_values = group_by(struct_keys).sum(values)
    # print statements converted to the print function for Python 3 compatibility
    print('sum per group of identical rows')
    for uk in unique_keys:
        print(uk)
    print(reduced_values)
- ##test_fancy_keys()
- ##quit()
def test_radial():
    """radial-reduction demo: distinguish an airy from a gaussian psf in noisy data"""
    x = np.linspace(-2, 2, 64)
    y = x[:, None]
    x = x[None, :]
    R = np.sqrt(x ** 2 + y ** 2)

    def airy(r, sigma):
        from scipy.special import j1
        r = r / sigma * np.sqrt(2)
        a = (2 * j1(r) / r) ** 2
        a[r == 0] = 1
        return a

    def gauss(r, sigma):
        return np.exp(-(r / sigma) ** 2)

    distribution = np.random.choice([gauss, airy])(R, 0.3)
    # np.float was removed from numpy (1.24); use the builtin float dtype instead
    sample = np.random.poisson(distribution * 100 + 10).astype(float)

    import matplotlib.pyplot as pp
    print('is this an airy or gaussian function? hard to tell with all this noise!')
    pp.imshow(sample, interpolation='nearest')
    pp.show()
    # radial reduction to the rescue!
    print('if we are sampling an airy function, you will see a small but significant rise around x=1')
    pp.plot(*group_by(np.round(R, 5)).mean(sample.flatten()))
    pp.xlim(0, 2)
    pp.show()

# an unconditional test_radial() followed by quit() used to run here at module
# level, which killed any importer of this module and made the definitions
# below unreachable; commented out for consistency with the other demo calls
##test_radial()
##quit()
def test_meshing():
    """
    Meshing example: express computational-geometry questions in pure numpy.
    """
    # set up some random points, and get their delaunay triangulation
    points = np.random.random((20, 2)) * 2 - 1
    points = points[np.linalg.norm(points, axis=1) < 1]
    # scipy.spatial.qhull is a private module (removed from modern scipy);
    # import Delaunay from the public namespace instead
    from scipy.spatial import Delaunay
    d = Delaunay(points)
    tris = d.simplices

    # the operations provided in this module allow us to express potentially complex
    # computational geometry questions elegantly in numpy
    # Delaunay.neighbors could be used as well, but the point is to express this in
    # pure numpy, without additional library functionality
    edges = tris[:, [[0, 1], [1, 2], [2, 0]]].reshape(-1, 2)
    sorted_edges = np.where(edges[:, 0:1] < edges[:, 1:2], edges, edges[:, ::-1])

    # we test to see how often each edge occurs, or how many incident simplices it has
    # this is a very general method of finding the boundary of any topology
    # and we can do so here with only one simple and readable command, multiplicity == 1
    index = ObjectIndex(sorted_edges)
    boundary_edges = edges[multiplicity(index) == 1]
    boundary_points = unique(boundary_edges)

    if False:
        print(boundary_edges)
        print(incidence(boundary_edges))
        print(len(incidence(index)))
        inverse = index.inverse
        FE = inverse.reshape(-1, 3)
        print(FE)
        print(incidence(FE))

    # create some random values on faces
    # we want to smooth them over the mesh to create a nice hilly landscape
    face_values = np.random.random(d.nsimplex)
    # add some salt and pepper noise, to make our problem more interesting
    face_values[np.random.randint(d.nsimplex, size=10)] += 1000

    # in order to smooth, we need to map from faces to vertices, and back again
    # mapping from vertices to faces is simple; the converse not so much.
    # yet it can be done concisely and efficiently using a group reduction operation
    # start with a median step, to remove salt-and-pepper noise
    # toggle to see the effect of the median filter
    g = group_by(tris)
    prestep = g.median if True else g.mean
    _, vertex_values = prestep(np.repeat(face_values, 3))

    # actually, we can compute the mean without grouping
    tris_per_vert = g.count

    def scatter(x):
        r = np.zeros(d.npoints, x.dtype)
        for idx in tris.T:
            np.add.at(r, idx, x)
        return r / tris_per_vert

    def gather(x):
        return x[tris].mean(axis=1)

    for i in range(100):
        face_values = gather(vertex_values)
        vertex_values = scatter(face_values)

    # display our nicely rolling hills and their boundary
    import matplotlib.pyplot as plt
    x, y = points.T
    plt.tripcolor(x, y, triangles=tris, facecolors=face_values)
    plt.scatter(x[boundary_points], y[boundary_points])
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.axis('equal')
    plt.show()


# guard the demo invocation so importing this module has no side effects
if __name__ == '__main__':
    d = test_meshing()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement