1 | """ | |
2 | note on np.unique | |
3 | ||
4 | this module expands on the functionality one would currently bootstrap from np.unique | |
5 | to facilitate the functionality implemented in this module, a generalization of unique is proposed | |
6 | that is, the version of unique implemented here also takes a lex-sortable tuple, | |
7 | or multidimensional array viewed along one of its axes as a valid input; i.e.: | |
8 | ||
9 | #stack of 4 images | |
10 | images = np.random.rand(4,64,64) | |
11 | #shuffle the images; this is a giant mess now; how to find the unique ones? | |
12 | shuffled = images[np.random.randint(0,4,200)] | |
13 | #there you go | |
14 | print unique(shuffled, axis=0) | |
15 | ||
16 | furthermore, some np.unique related functions are proposed, with a more friendly interface | |
17 | (multiplicity, count) | |
18 | """ | |
19 | ||
20 | ||
21 | """ | |
22 | A note on pandas | |
23 | This module has substantial overlap with pandas' grouping functionality. | |
24 | So what's the reason for implementing it in numpy? | |
25 | |
26 | |
27 | |
28 | |
29 | The recently added ufunc support means that we can now express grouping operations in pure numpy; | |
30 | that is, without any slow python loops or cumbersome C-extensions. | |
31 | |
32 | I was not aware of pandas' functionality on this front when I started working on it; | |
33 | so I am somewhat pleased to say that many core design elements have turned out the same. | |
34 | I take that to mean these design decisions probably make sense. | |
35 | ||
36 | It does raise the question as to where the proper line between pandas and numpy lies. | |
37 | I would argue that evidently, most of pandas functionality has no place in numpy. | |
38 | Then how is grouping different? I feel what lies at the heart of pandas is a permanent conceptual | |
39 | association between various pieces of data, assembled in a dataframe and all its metadata. | |
40 | I think numpy ought to stay well clear of that. | |
41 | On the other hand, you don't want to go around creating a pandas dataframe | |
42 | just to plot a radial reduction; these kinds of transient, single-statement associations | |
43 | between keys and values are very useful, entirely independently of some heavyweight framework. | |
44 | |
45 | Further questions raised from pandas: should we have some form of merge/join functionality too? | |
46 | or is this getting too panda-ey? all the use cases I can think of fail to be pressed | |
47 | into some kind of standard mould, but I might be missing something here | |
48 | """ | |
49 | |
50 | ||
51 | import numpy as np | |
52 | import itertools | |
53 | ||
54 | ||
55 | ||
56 | ||
57 | ||
58 | |
59 | """ | |
60 | this toggle switches between preferred or backwards compatible semantics | |
61 | for dealing with key objects. current behavior for the arguments to functions | |
62 | like np.unique is to flatten any input arrays. | |
63 | i think a more unified semantics is achieved by interpreting all key arguments as | |
64 | sequences of key objects, so that non-flat arrays are simply sequences of keys, | |
65 | where the keys themselves are arrays of dimension ndim-1 | |
66 | for reasons of backwards compatibility, it is probably wise to retain the default, | |
67 | but at least an axis keyword to toggle this behavior would be a welcome addition | |
68 | """ | |
69 | backwards_compatible = False | |
70 | if backwards_compatible: | |
71 | axis_default = None | |
72 | else: | |
73 | axis_default = 0 | |
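
#a minimal illustrative sketch of the two semantics discussed above (shown as comments only,
#since unique is defined towards the end of this module):
#  k = np.array([[0, 1], [0, 1], [2, 3]])
#  unique(k, axis=None)   -> [0, 1, 2, 3]        #flattened: four scalar keys
#  unique(k, axis=0)      -> [[0, 1], [2, 3]]    #axis semantics: two unique row-keys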
74 | ||
75 | ||
76 | ||
77 | """ | |
78 | some utility functions | |
79 | """ | |
80 | ||
81 | def as_struct_array(*cols): | |
82 | """pack a bunch of columns as a struct""" | |
83 | cols = [np.asarray(c) for c in cols] | |
84 | rows = len(cols[0]) | |
85 | data = np.empty(rows, [('f'+str(i), c.dtype, c.shape[1:]) for i,c in enumerate(cols)]) | |
86 | for i,c in enumerate(cols): | |
87 | data['f'+str(i)] = c | |
88 | return data | |
89 | ||
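#a minimal illustrative sketch of as_struct_array; the example_* helper below is not part
#of the module itself, and the field names f0/f1 are simply those generated above
def example_as_struct_array():
    a = np.array([1, 1, 2])
    b = np.array([0.5, 0.5, 1.5])
    packed = as_struct_array(a, b)      #one record per row; records act as compound keys
    assert packed.dtype.names == ('f0', 'f1')
    assert len(packed) == 3
##example_as_struct_array()
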
90 | def axis_as_object(arr, axis=-1): | |
91 | """ | |
92 | cast the given axis of an array to a void object | |
93 | if the axis to be cast is contiguous, a view is returned, otherwise a copy | |
94 | this is useful for efficiently sorting by the content of an axis, for instance | |
95 | """ | |
96 | shape = arr.shape | |
97 | arr = np.ascontiguousarray(np.swapaxes(arr, axis, -1)) | |
98 | return arr.view(np.dtype((np.void, arr.dtype.itemsize * shape[axis]))).reshape(arr.shape[:-1]) | |
99 | def object_as_axis(arr, dtype, axis=-1): | |
100 | """cast an array of void objects to a typed axis""" | |
101 | return np.swapaxes(arr.view(dtype).reshape(arr.shape+(-1,)), axis, -1) | |
102 | ||
103 | ||
104 | def array_as_object(arr): | |
105 | """view everything but the first axis as a void object""" | |
106 | arr = arr.reshape(len(arr),-1) | |
107 | return axis_as_object(arr) | |
108 | def array_as_typed(arr, dtype, shape): | |
109 | """unwrap a void object to its original type and shape""" | |
110 | return object_as_axis(arr, dtype).reshape(shape) | |
111 | ||
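#a minimal illustrative sketch of the void-view helpers above; the example_* helper is not
#part of the module. identical rows compare equal as void scalars, which is all the index
#machinery relies on (the ordering of void objects is over raw bytes, not numeric)
def example_void_view_roundtrip():
    arr = np.array([[3, 4], [1, 2], [3, 4]])
    packed = axis_as_object(arr)                               #one void scalar per row
    assert packed[0] == packed[2]                              #identical rows compare equal
    assert np.all(object_as_axis(packed, arr.dtype) == arr)    #round trip restores the data
##example_void_view_roundtrip()
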
112 | ##def array_as_struct(arr): | |
113 | ## return np.ascontiguousarray(arr).view([('f0', arr.dtype, arr.shape[1:])])#.flatten() | |
114 | ##def struct_as_array(arr): | |
115 | ## return arr.view(arr['f0'].dtype) | |
116 | ||
117 | ||
118 | - | """ive never run into any use cases for this; but included for backwards compatibility""" |
118 | + | |
119 | ||
120 | ||
121 | """ | |
122 | class hierarchy for indexing a set of keys | |
123 | """ | |
124 | |
125 | |
126 | ||
127 | """ | |
128 | A note on naming: 'Index' here refers to the fact that the goal of these classes is to | |
129 | perform and store precomputations on a set of keys, | |
130 | such as to accelerate subsequent operations involving these keys. | |
131 | They are not 'logical' indexes as in pandas; | |
132 | they are not permanently associated with any other data objects | |
133 | ||
134 | Note that these classes are not primarily intended to be used directly from the numpy namespace, | |
135 | but rather are intended for code reuse within a family of higher level operations; | |
136 | only the latter need to be part of the numpy API. | |
137 | ||
138 | That said, as the meshing example demonstrates, these classes can also be very useful | |
139 | when used directly in end-user code, | |
140 | saving you from completely reinventing the wheel. | |
141 | """ | |
142 | |
143 | |
144 | class BaseIndex(object): | |
145 | """ | |
146 | minimal indexing functionality | |
147 | only provides unique keys and counts, with minimal overhead | |
148 | no grouping or lex-keys are supported, | |
149 | or anything that would require an indirect sort | |
150 | """ | |
151 | def __init__(self, keys): | |
152 | """ | |
153 | keys is a flat array of possibly composite type | |
154 | """ | |
155 | self.keys = np.asarray(keys).flatten() | |
156 | self.sorted = np.sort(self.keys) | |
157 | #the slicing points of the bins to reduce over | |
158 | self.flag = self.sorted[:-1] != self.sorted[1:] | |
159 | self.slices = np.concatenate(( | |
160 | [0], | |
161 | np.flatnonzero(self.flag)+1, | |
162 | [self.size])) | |
163 | ||
164 | @property | |
165 | def start(self): | |
166 | """start index of all bins""" | |
167 | return self.slices[:-1] | |
168 | @property | |
169 | def stop(self): | |
170 | """stop index of all bins""" | |
171 | return self.slices[1:] | |
172 | @property | |
173 | def groups(self): | |
174 | return len(self.start) | |
175 | @property | |
176 | def size(self): | |
177 | return self.keys.size | |
178 | ||
179 | |
180 | @property | |
181 | def unique(self): | |
182 | """the first entry of each bin is a unique key""" | |
183 | return self.sorted[self.start] | |
184 | @property | |
185 | def count(self): | |
186 | """the number of times each unique key occurs""" | |
187 | return np.diff(self.slices) | |
188 | @property | |
189 | def uniform(self): | |
190 | """returns true if each key occurs an equal number of times""" | |
191 | return not np.any(np.diff(self.count)) | |
192 | ||
193 | ||
194 | class Index(BaseIndex): | |
195 | """ | |
196 | index object over a set of keys | |
197 | adds support for more extensive functionality, notably grouping | |
198 | """ | |
199 | def __init__(self, keys): | |
200 | """ | |
201 | keys is a flat array of possibly composite type | |
202 | """ | |
203 | self.keys = np.asarray(keys) | |
204 | #find indices which sort the keys | |
205 | self.sorter = np.argsort(self.keys) | |
206 | #computed sorted keys | |
207 | self.sorted = self.keys[self.sorter] | |
208 | #the slicing points of the bins to reduce over | |
209 | self.flag = self.sorted[:-1] != self.sorted[1:] | |
210 | self.slices = np.concatenate(( | |
211 | [0], | |
212 | np.flatnonzero(self.flag)+1, | |
213 | [self.size])) | |
214 | ||
215 | @property | |
216 | def index(self): | |
217 | """I've never run into any use cases for this; | |
218 | perhaps it was intended to be used to do group_by(keys).first(values)? | |
219 | in any case, included for backwards compatibility with np.unique""" | |
220 | return self.sorter[self.start] | |
221 | @property | |
222 | def rank(self): | |
223 | """how high in sorted list each key is""" | |
224 | return self.sorter.argsort() | |
225 | @property | |
226 | def sorted_group_rank_per_key(self): | |
227 | """find a better name for this? enumeration of sorted keys. also used in median implementation""" | |
228 | return np.cumsum(np.concatenate(([False], self.flag))) | |
229 | @property | |
230 | def inverse(self): | |
231 | """return index array that maps unique values back to original space; unique[inverse]==keys""" | |
232 | return self.sorted_group_rank_per_key[self.rank] | |
233 | |
234 | |
235 | |
236 | |
237 | |
238 | ## """ | |
239 | ## return np.searchsorted(self.unique, other) | |
240 | ||
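#a minimal illustrative sketch of the contract shared by these index classes; the example_*
#helper is not part of the module. unique, count and inverse are mutually consistent
def example_index_contract():
    keys = np.array([3, 1, 3, 2, 1, 3])
    index = Index(keys)
    assert np.all(index.unique == [1, 2, 3])
    assert np.all(index.count == [2, 1, 3])
    assert np.all(index.unique[index.inverse] == keys)    #inverse maps back onto the original keys
##example_index_contract()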
241 | ||
242 | ||
243 | class ObjectIndex(Index): | |
244 | """ | |
245 | given axis enumerates the keys | |
246 | all other axes form the keys | |
247 | groups will be formed on the basis of bitwise equality | |
248 | ||
249 | should we retire objectindex? | |
250 | this can be integrated with regular index ala lexsort, no? | |
251 | not sure what is more readable though | |
252 | """ | |
253 | def __init__(self, keys, axis): | |
254 | self.axis = axis | |
255 | self.dtype = keys.dtype | |
256 | |
257 | keys = np.swapaxes(keys, axis, 0) | |
258 | self.shape = keys.shape | |
259 | keys = array_as_object(keys) | |
260 | ||
261 | super(ObjectIndex, self).__init__(keys) | |
262 | ||
263 | @property | |
264 | - | return self.as_array(values) if self.index.uniform else self.as_list(values) |
264 | + | |
265 | """the first entry of each bin is a unique key""" | |
266 | - | #not sure how i feel about this. explicit is better than implict |
266 | + | sorted = array_as_typed(self.sorted, self.dtype, self.shape) |
267 | - | #but group_by(keys).group(values) does not sit too well with me either |
267 | + | return np.swapaxes(sorted[self.start], self.axis, 0) |
268 | ||
269 | ||
270 | class LexIndex(Index): | |
271 | """ | |
272 | index object based on lexicographic ordering of a tuple of key-arrays | |
273 | key arrays can be any type, including multi-dimensional, structured or void objects | |
274 | however, passing such fancy keys to lexindex may not be ideal from a performance perspective, | |
275 | as lexsort does not accept them as arguments directly, so we have to index and invert them first | |
276 | ||
277 | should you find yourself with such complex keys, it may be more efficient | |
278 | to place them into a structured array first | |
279 | ||
280 | note that multidimensional columns are indexed by their first column, | |
281 | and no per-column axis keyword is supplied, | |
282 | customization of column layout will have to be done at the call site | |
283 | """ | |
284 | |
285 | def __init__(self, keys): | |
286 | self.keys = tuple(np.asarray(key) for key in keys) | |
287 | self.dtypes = tuple(key.dtype for key in self.keys) | |
288 | self.shapes = tuple(key.shape for key in self.keys) | |
289 | ||
290 | keyviews = tuple(array_as_object(key) if key.ndim>1 else key for key in self.keys) | |
291 | #find indices which sort the keys; complex keys which lexsort does not accept are bootstrapped from Index | |
292 | self.sorter = np.lexsort(tuple(Index(key).inverse if key.dtype.kind == 'V' else key for key in keyviews)) | |
293 | #computed sorted keys | |
294 | self.sorted = tuple(key[self.sorter] for key in keyviews) | |
295 | #the slicing points of the bins to reduce over | |
296 | self.flag = reduce( | |
297 | np.logical_or, | |
298 | (s[:-1] != s[1:] for s in self.sorted)) | |
299 | self.slices = np.concatenate(( | |
300 | [0], | |
301 | np.flatnonzero(self.flag)+1, | |
302 | [self.size])) | |
303 | ||
304 | @property | |
305 | def unique(self): | |
306 | """returns a tuple of unique key columns""" | |
307 | return tuple( | |
308 | (array_as_typed(s, dtype, shape) if len(shape)>1 else s)[self.start] | |
309 | for s, dtype, shape in zip(self.sorted, self.dtypes, self.shapes)) | |
310 | ||
311 | @property | |
312 | def size(self): | |
313 | return self.sorter.size | |
314 | ||
315 | class LexIndexSimple(Index): | |
316 | - | """compute the median value over each group. pretty clever, if i may say so myself""" |
316 | + | |
317 | simplified LexIndex, which only accepts 1-d arrays of simple dtypes | |
318 | the more expressive LexIndex only has some logic overhead, | |
319 | in case all columns are indeed simple. not sure this is worth the extra code | |
320 | """ | |
321 | def __init__(self, keys): | |
322 | self.keys = tuple(np.asarray(key) for key in keys) | |
323 | self.sorter = np.lexsort(self.keys) | |
324 | #computed sorted keys | |
325 | self.sorted = tuple(key[self.sorter] for key in self.keys) | |
326 | #the slicing points of the bins to reduce over | |
327 | self.flag = reduce( | |
328 | np.logical_or, | |
329 | (s[:-1] != s[1:] for s in self.sorted)) | |
330 | self.slices = np.concatenate(( | |
331 | [0], | |
332 | np.flatnonzero(self.flag)+1, | |
333 | [self.size])) | |
334 | ||
335 | @property | |
336 | def unique(self): | |
337 | """the first entry of each bin is a unique key""" | |
338 | return tuple(s[self.start] for s in self.sorted) | |
339 | ||
340 | @property | |
341 | def size(self): | |
342 | return self.sorter.size | |
343 | ||
344 | ||
345 | ||
346 | def as_index(keys, axis = axis_default, base=False): | |
347 | """ | |
348 | casting rules for a keys object to an index | |
349 | ||
350 | the preferred semantics is that keys is a sequence of key objects, | |
351 | except when keys is an instance of tuple, | |
352 | in which case the zipped elements of the tuple are the key objects | |
353 | ||
354 | the axis keyword specifies the axis which enumerates the keys | |
355 | if axis is None, the keys array is flattened | |
356 | if axis is 0, the first axis enumerates the keys | |
357 | which of these two is the default depends on whether backwards_compatible == True | |
358 | ||
359 | if base==True, the most basic index possible is constructed. | |
360 | this avoids an indirect sort; if it isn't required, this has better performance | |
361 | """ | |
362 | if isinstance(keys, Index): | |
363 | return keys | |
364 | if isinstance(keys, tuple): | |
365 | return LexIndex(keys) | |
366 | try: | |
367 | keys = np.asarray(keys) | |
368 | except: | |
369 | raise TypeError('Given object does not form a valid set of keys') | |
370 | if axis is None: | |
371 | keys = keys.flatten() | |
372 | if keys.ndim==1: | |
373 | if base: | |
374 | return BaseIndex(keys) | |
375 | else: | |
376 | return Index(keys) | |
377 | else: | |
378 | return ObjectIndex(keys, axis) | |
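
#a minimal illustrative sketch of the casting rules described above; the example_* helper
#is not part of the module, and assumes the non-backwards-compatible default of axis=0
def example_as_index_casting():
    assert isinstance(as_index(np.arange(4)), Index)                      #1d array of keys
    assert isinstance(as_index(np.arange(4), base=True), BaseIndex)       #cheapest possible index
    assert isinstance(as_index((np.arange(4), np.ones(4))), LexIndex)     #tuple: zipped key columns
    assert isinstance(as_index(np.zeros((4, 3)), axis=0), ObjectIndex)    #nd-array: rows are keys
##example_as_index_casting()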
379 | |
380 | |
381 | |
382 | |
383 | |
384 | """ | |
385 | public API starts here | |
386 | """ | |
387 | ||
388 | ||
389 | class GroupBy(object): | |
390 | """ | |
391 | supports ufunc reduction | |
392 | should any other form of reduction be supported? | |
393 | not sure it should; more cleanly written externally I think, on grouped iterables | |
394 | """ | |
395 | def __init__(self, keys, axis = 0): | |
396 | |
397 | |
398 | # note that we dont have backwards compatibility issues with groupby, | |
399 | #so we are going to use axis = 0 as a default | |
400 | #the multi-dimensional structure of a keys object is usually meaningful, | |
401 | #and value arguments are not automatically flattened either | |
402 | ||
403 | self.index = as_index(keys, axis) | |
404 | ||
405 | #forward interesting/'public' index properties | |
406 | @property | |
407 | def unique(self): | |
408 | return self.index.unique | |
409 | @property | |
410 | def count(self): | |
411 | return self.index.count | |
412 | @property | |
413 | def inverse(self): | |
414 | return self.index.inverse | |
415 | @property | |
416 | def rank(self): | |
417 | return self.index.rank | |
418 | ||
419 | ||
420 | #some different methods of chopping up a set of values by key | |
421 | #not sure they are all equally relevant, but i actually have real world use cases for most of them | |
422 | ||
423 | def as_iterable_from_iterable(self, values): | |
424 | """ | |
425 | grouping of an iterable. memory consumption depends on the amount of sorting required | |
426 | worst case, if index.sorter[-1] = 0, we need to consume the entire value iterable, | |
427 | before we can start yielding any output | |
428 | but to the extent that the keys come presorted, the grouping is lazy | |
429 | """ | |
430 | values = iter(enumerate(values)) | |
431 | cache = dict() | |
432 | def get_value(ti): | |
433 | try: | |
434 | return cache.pop(ti) | |
435 | except: | |
436 | while True: | |
437 | i, v = next(values) | |
438 | if i==ti: | |
439 | return v | |
440 | cache[i] = v | |
441 | s = iter(self.index.sorter) | |
442 | for c in self.count: | |
443 | yield (get_value(i) for i in itertools.islice(s, c)) | |
444 | ||
445 | def as_outoforder(self, values): | |
446 | """ | |
447 | group values, without regard for the ordering of self.index.unique | |
448 | consume values as they come, and yield key-group pairs as soon as they complete | |
449 | this approach is lazy, insofar as grouped values are close together in their iterable | |
450 | """ | |
451 | from collections import defaultdict | |
452 | cache = defaultdict(list) | |
453 | count = self.count | |
454 | unique = self.unique | |
455 | key = (lambda i: unique[i]) if isinstance(unique, np.ndarray) else (lambda i: tuple(c[i] for c in unique)) | |
456 | for i,v in itertools.izip(self.inverse, values): | |
457 | cache[i].append(v) | |
458 | if len(cache[i]) == count[i]: | |
459 | yield key(i), cache.pop(i) | |
460 | |
461 | def as_iterable_from_sequence(self, values): | |
462 | """ | |
463 | this is the preferred method if values has random access, | |
464 | but we don't want it completely in memory. | |
465 | like a big memory mapped file, for instance | |
466 | """ | |
467 | s = iter(self.index.sorter) | |
468 | for c in self.count: | |
469 | yield (values[i] for i in itertools.islice(s, c)) | |
470 | |
471 | def as_array(self, values): | |
472 | """ | |
473 | return grouped values as an ndarray, | |
474 | of shape [groups, groupsize, ungrouped-axes] | |
475 | this is only possible if index.uniform==True | |
476 | """ | |
477 | assert(self.index.uniform) | |
478 | values = np.asarray(values) | |
479 | values = values[self.index.sorter] | |
480 | return values.reshape(self.index.groups, self.count[0], *values.shape[1:]) | |
481 | ||
482 | def as_list(self, values): | |
483 | """group values into a list of arrays, one array per unique key""" | |
484 | values = np.asarray(values) | |
485 | values = values[self.index.sorter] | |
486 | return np.split(values, self.index.slices[1:-1], axis=0) | |
487 | ||
488 | def group(self, values): | |
489 | try: | |
490 | return self.as_array(values) | |
491 | except: | |
492 | return self.as_list(values) | |
493 | ||
494 | def __call__(self, values): | |
495 | """ | |
496 | not sure how i feel about this. explicit is better than implicit | |
497 | but group_by(keys).group(values) does not sit too well with me either | |
498 | """ | |
499 | return self.unique, self.group(values) | |
500 | ||
501 | ||
502 | # ufunc based reduction methods | |
503 | |
504 | def reduce(self, values, operator = np.add): | |
505 | """ | |
506 | reduce the values within each key-group, using the given ufunc | |
507 | the reduction is over the first axis of values, which must correspond with the keys; | |
508 | all other axes are treated independently for the sake of this reduction | |
509 | note that this code is only C-vectorized over the first axis | |
510 | if the remaining axes are large, the python loop inside apply_along_axis dominates, | |
511 | in which case reducing over the groups returned by | |
512 | as_list may be preferable | |
513 | """ | |
514 | values = values[self.index.sorter] | |
515 | if values.ndim>1: | |
516 | return np.apply_along_axis( | |
517 | lambda slc: operator.reduceat(slc, self.index.start), | |
518 | 0, values) | |
519 | else: | |
520 | return operator.reduceat(values, self.index.start) | |
521 | ||
522 | def sum(self, values, axis=0): | |
523 | """compute the sum over each group""" | |
524 | values = np.asarray(values) | |
525 | if axis: values = np.rollaxis(values, axis) | |
526 | return self.unique, self.reduce(values) | |
527 | ||
528 | def mean(self, values, axis=0): | |
529 | """compute the mean over each group""" | |
530 | values = np.asarray(values) | |
531 | if axis: values = np.rollaxis(values, axis) | |
532 | count = self.count.reshape(-1,*(1,)*(values.ndim-1)) | |
533 | return self.unique, self.reduce(values) / count | |
534 | ||
535 | def var(self, values, axis=0): | |
536 | """compute the variance over each group""" | |
537 | values = np.asarray(values) | |
538 | if axis: values = np.rollaxis(values, axis) | |
539 | count = self.count.reshape(-1,*(1,)*(values.ndim-1)) | |
540 | mean = self.reduce(values) / count | |
541 | err = values - mean[self.inverse] | |
542 | return self.unique, self.reduce(err**2) / count | |
543 | ||
544 | def std(self, values, axis=0): | |
545 | """compute the standard deviation over each group""" | |
546 | unique, var = self.var(values, axis) | |
547 | return unique, np.sqrt(var) | |
548 | |
549 | def median(self, values, axis=0, average=True): | |
550 | """ | |
551 | compute the median value over each group. | |
552 | when average is true, the average of the two central values is taken | |
553 | for groups with an even key-count | |
554 | """ | |
555 | values = np.asarray(values) | |
556 | ||
557 | #per-group positions of the central element(s) in the sorted values | |
558 | hi = (self.index.start + self.index.stop    ) // 2 | |
559 | lo = (self.index.start + self.index.stop - 1) // 2 | |
560 | ||
561 | #need this indirection for lex-index compatibility | |
562 | sorted_group_rank_per_key = self.index.sorted_group_rank_per_key | |
563 | |
564 | def median1d(slc): | |
565 | #place values at correct keys; preconditions the upcoming lexsort | |
566 | slc = slc[self.index.sorter] | |
567 | #refine value sorting within each keygroup | |
568 | sorter = np.lexsort((slc, sorted_group_rank_per_key)) | |
569 | slc = slc[sorter] | |
570 | return (slc[lo]+slc[hi]) / 2 if average else slc[hi] | |
571 | ||
572 | values = np.asarray(values) | |
573 | if axis: values = np.rollaxis(values, axis) | |
574 | if values.ndim>1: #is trying to skip apply_along_axis somewhat premature optimization? | |
575 | values = np.apply_along_axis(median1d, 0, values) | |
576 | else: | |
577 | values = median1d(values) | |
578 | return self.unique, values | |
579 | ||
580 | def min(self, values, axis=0): | |
581 | """return the minimum within each group""" | |
582 | values = np.asarray(values) | |
583 | if axis: values = np.rollaxis(values, axis) | |
584 | return self.unique, self.reduce(values, np.minimum) | |
585 | ||
586 | def max(self, values, axis=0): | |
587 | """return the maximum within each group""" | |
588 | values = np.asarray(values) | |
589 | if axis: values = np.rollaxis(values, axis) | |
590 | return self.unique, self.reduce(values, np.maximum) | |
591 | ||
592 | def first(self, values, axis=0): | |
593 | """return values at first occurance of its associated key""" | |
594 | values = np.asarray(values) | |
595 | if axis: values = np.rollaxis(values, axis) | |
596 | return self.unique, values[self.index.sorter[self.index.start]] | |
597 | ||
598 | def last(self, values, axis=0): | |
599 | """return values at last occurance of its associated key""" | |
600 | values = np.asarray(values) | |
601 | if axis: values = np.rollaxis(values, axis) | |
602 | return self.unique, values[self.index.sorter[self.index.stop-1]] | |
603 | ||
604 | ||
605 | ||
606 | #implement iter interface? could simply do zip( group_by(keys)(values)), no? | |
607 | ||
608 | #just an alias, for those who dont like camelcase | |
609 | group_by = GroupBy | |
610 | #could also turn this into a function with optional values and reduction func. | |
611 | def group_by(keys, values=None, reduction=None, axis=0): | |
612 | g = GroupBy(keys, axis) | |
613 | if values is None: | |
614 | return g | |
615 | groups = g.group(values) | |
616 | if reduction is None: | |
617 | return groups | |
618 | return [reduction(group) for group in groups] | |
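
#a minimal illustrative sketch of the three ways to call group_by; the example_* helper
#is not part of the module
def example_group_by_calls():
    keys   = np.array([1, 2, 1, 2, 1])
    values = np.array([1., 2., 3., 4., 5.])
    g = group_by(keys)                               #keys only: a GroupBy object
    assert np.all(g.unique == [1, 2])
    assert np.all(g.sum(values)[1] == [9., 6.])      #ufunc based reduction per group
    means = group_by(keys, values, np.mean)          #keys, values and a python-level reduction
    assert np.allclose(means, [3., 3.])
##example_group_by_calls()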
619 | ||
620 | ||
621 | ||
622 | """ | |
623 | some demonstration of the overlap between this code and np.unique | |
624 | they share a lot of common functionality | |
625 | we can't quite bootstrap grouping from unique as is | |
626 | but unique can easily be reimplemented using the Index class | |
627 | """ | |
628 | ||
629 | def unique(keys, return_index = False, return_inverse = False, return_count = False, axis = axis_default): | |
630 | """ | |
631 | backwards compatible interface with numpy.unique | |
632 | in the long term i think the kwargs should be deprecated though | |
633 | cleaner to call index and its properties directly, | |
634 | should you want something beyond the interface provided | |
635 | """ | |
636 | index = as_index(keys, axis, base = not (return_index or return_inverse)) | |
637 | ||
638 | ret = index.unique, | |
639 | if return_index: | |
640 | ret = ret + (index.index,) | |
641 | if return_inverse: | |
642 | ret = ret + (index.inverse,) | |
643 | if return_count: | |
644 | ret = ret + (index.count,) | |
645 | return ret[0] if len(ret) == 1 else ret | |
646 | ||
647 | def count(keys, axis = axis_default): | |
648 | """numpy work-alike of collections.Counter""" | |
649 | index = as_index(keys, axis, base = True) | |
650 | return index.unique, index.count | |
651 | ||
652 | def multiplicity(keys, axis = axis_default): | |
653 | """ | |
654 | return the multiplicity of each key, or how often it occurs in the set | |
655 | note that we are not interested in the unique values for this operation, | |
656 | casting doubt on the current numpy design which places np.unique | |
657 | at the center of the interface for this kind of operation | |
658 | given how often I use multiplicity, I'd like to have it in the numpy namespace | |
659 | """ | |
660 | index = as_index(keys, axis) | |
661 | return index.count[index.inverse] | |
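
#a minimal illustrative sketch contrasting count and multiplicity; the example_* helper
#is not part of the module
def example_count_vs_multiplicity():
    keys = np.array(['a', 'b', 'b', 'a', 'b'])
    unique_keys, counts = count(keys)                        #one entry per unique key
    assert np.all(counts == [2, 3])
    assert np.all(multiplicity(keys) == [2, 3, 3, 2, 3])     #one entry per original key
##example_count_vs_multiplicity()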
662 | ||
663 | def rank(keys, axis = axis_default): | |
664 | """ | |
665 | where each item is in the pecking order. | |
666 | not sure this should be part of the public api, can't think of any use-case right away | |
667 | plus, we have a namespace conflict | |
668 | """ | |
669 | index = as_index(keys, axis) | |
670 | return index.rank | |
671 | ||
672 | ||
673 | def incidence(boundary): | |
674 | """ | |
675 | given an Nxm matrix containing boundary info between simplices, | |
676 | compute incidence info matrix | |
677 | not to be part of numpy API, to be sure, just something I'm playing with | |
678 | """ | |
679 | return group_by(boundary).group(np.arange(boundary.size) // boundary.shape[1]) | |
680 | ||
681 | ||
682 | ||
683 | ||
684 | def test_basic(): | |
685 | ||
686 | keys = ["e", "b", "b", "c", "d", "e", "e", 'a'] | |
687 | values = [1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01,1] | |
688 | ||
689 | ## keys = np.array(["a", "b", "b", "c", "d", "e", "e",'d','a','c']) | |
690 | ## values = np.array([1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1,2,3]) | |
691 | ||
692 | print 'two methods of computing non-reducing group' | |
693 | print 'as iterable' | |
694 | g = group_by(keys) | |
695 | for k,v in zip(g.unique, g.as_iterable_from_sequence(values)): | |
696 | print k, list(v) | |
697 | print 'as list' | |
698 | for k,v in zip(*group_by(keys)(values)): | |
699 | print k, v | |
700 | ||
701 | print 'some reducing group operations' | |
702 | g = group_by(keys) | |
703 | unique_keys, reduced_values = g.median(values) | |
704 | print 'per group median' | |
705 | print reduced_values | |
706 | unique_keys, reduced_values = g.mean(values) | |
707 | print 'per group mean' | |
708 | print reduced_values | |
709 | unique_keys, reduced_values = g.std(values) | |
710 | print 'per group std' | |
711 | print reduced_values | |
712 | reduced_values = g.reduce(np.array(values), np.minimum) #alternate way of calling | |
713 | print 'per group min' | |
714 | print reduced_values | |
715 | unique_keys, reduced_values = g.max(values) | |
716 | print 'per group max' | |
717 | print reduced_values | |
718 | ||
719 | ||
720 | ##test_basic() | |
721 | ##quit() | |
722 | ||
723 | ||
724 | def test_lex_median(): | |
725 | """ | |
726 | for making sure I squashed all bugs related to fancy-keys and the median filter implementation | |
727 | """ | |
728 | keys1 = ["e", "b", "b", "c", "d", "e", "e", 'a'] | |
729 | keys2 = ["b", "b", "b", "d", "e", "e", 'e', 'e'] | |
730 | ## keys3 = np.random.randint(0,2,(8,2)) | |
731 | values = [1.2, 4.5, 4.3, 2.0, 5.6, 8.8, 9.1, 1] | |
732 | ||
733 | unique, median = group_by((keys1, keys2)).median(values) | |
734 | for i in zip(zip(*unique), median): | |
735 | print i | |
736 | ||
737 | ##test_lex_median() | |
738 | ##quit() | |
739 | ||
740 | ||
741 | def test_fancy_keys(): | |
742 | """ | |
743 | test Index subclasses | |
744 | """ | |
745 | keys = np.random.randint(0, 2, (20,3)).astype(np.int8) | |
746 | values = np.random.randint(-1,2,(20,4)) | |
747 | ||
748 | ||
749 | #all these various datastructures should produce the same behavior | |
750 | #multiplicity is a nice unit test, since it draws on most of the low level functionality | |
751 | if backwards_compatible: | |
752 | assert(np.all( | |
753 | multiplicity(keys, axis=0) == #void object indexing | |
754 | multiplicity(tuple(keys.T)))) #lexicographic indexing | |
755 | assert(np.all( | |
756 | multiplicity(keys, axis=0) == #void object indexing | |
757 | multiplicity(as_struct_array(keys)))) #struct array indexing | |
758 | else: | |
759 | assert(np.all( | |
760 | multiplicity(keys) == #void object indexing | |
761 | multiplicity(tuple(keys.T)))) #lexicographic indexing | |
762 | assert(np.all( | |
763 | multiplicity(keys) == #void object indexing | |
764 | multiplicity(as_struct_array(keys)))) #struct array indexing | |
765 | ||
766 | #lets go mixing some dtypes! | |
767 | floatkeys = np.zeros(len(keys)) | |
768 | floatkeys[0] = 8.8 | |
769 | print 'sum per group of identical rows using struct key' | |
770 | g = group_by(as_struct_array(keys, floatkeys)) | |
771 | for e in zip(g.count, *g.sum(values)): | |
772 | print e | |
773 | print 'sum per group of identical rows using lex of nd-key' | |
774 | g = group_by(( keys, floatkeys)) | |
775 | for e in zip(zip(*g.unique), g.sum(values)[1]): | |
776 | print e | |
777 | print 'sum per group of identical rows using lex of struct key' | |
778 | g = group_by((as_struct_array( keys), floatkeys)) | |
779 | for e in zip(zip(*g.unique), g.sum(values)[1]): | |
780 | print e | |
781 | ||
782 | #showcase enhanced unique functionality | |
783 | images = np.random.rand(4,4,4) | |
784 | #shuffle the images; this is a giant mess now; how to find the unique ones? | |
785 | shuffled = images[np.random.randint(0,4,200)] | |
786 | #there you go | |
787 | if backwards_compatible: | |
788 | print unique(shuffled, axis=0) | |
789 | else: | |
790 | print unique(shuffled) | |
791 | ||
792 | ||
793 | ##g = test_fancy_keys() | |
794 | ##quit() | |
795 | ||
796 | ||
797 | def test_radial(): | |
798 | x = np.linspace(-2,2, 64) | |
799 | y = x[:, None] | |
800 | x = x[None, :] | |
801 | R = np.sqrt( x**2+y**2) | |
802 | ||
803 | def airy(r, sigma): | |
804 | from scipy.special import j1 | |
805 | r = r / sigma * np.sqrt(2) | |
806 | a = (2*j1(r)/r)**2 | |
807 | a[r==0] = 1 | |
808 | return a | |
809 | def gauss(r, sigma): | |
810 | return np.exp(-(r/sigma)**2) | |
811 | ||
812 | distribution = np.random.choice([gauss, airy])(R, 0.3) | |
813 | sample = np.random.poisson(distribution*200+10).astype(np.float) | |
814 | ||
815 | import matplotlib.pyplot as pp | |
816 | #is this an airy or gaussian function? hard to tell with all this noise! | |
817 | pp.imshow(sample, interpolation='nearest', cmap='gray') | |
818 | pp.show() | |
819 | #radial reduction to the rescue! | |
820 | #if we are sampling an airy function, you will see a small but significant rise around x=1 | |
821 | g = group_by(np.round(R, 5).flatten()) | |
822 | pp.errorbar( | |
823 | g.unique, | |
824 | g.mean(sample.flatten())[1], | |
825 | g.std (sample.flatten())[1] / np.sqrt(g.count)) | |
826 | pp.xlim(0,2) | |
827 | pp.show() | |
828 | ||
829 | ##test_radial() | |
830 | ##quit() | |
831 | ||
832 | def test_meshing(): | |
833 | """ | |
834 | meshing example | |
835 | demonstrates the use of multiplicity, and group.median | |
836 | """ | |
837 | #set up some random points, and get their delaunay triangulation | |
838 | points = np.random.random((20000,2))*2-1 | |
839 | points = points[np.linalg.norm(points,axis=1) < 1] | |
840 | from scipy.spatial.qhull import Delaunay | |
841 | d = Delaunay(points) | |
842 | tris = d.simplices | |
843 | ||
844 | #the operations provided in this module allow us to express potentially complex | |
845 | #computational geometry questions elegantly in numpy | |
846 | #Delaunay.neighbors could be used as well, | |
847 | #but the point is to express this in pure numpy, without additional library functionality | |
848 | edges = tris[:,[[0,1],[1,2],[2,0]]].reshape(-1,2) | |
849 | sorted_edges = np.where(edges[:,0:1]<edges[:,1:2], edges, edges[:,::-1]) | |
850 | #we test to see how often each edge occurs, or how many incident simplices it has | |
851 | #this is a very general method of finding the boundary of any topology | |
852 | #and we can do so here with only one simple and readable command, multiplicity == 1 | |
853 | if backwards_compatible: | |
854 | boundary_edges = edges[multiplicity(sorted_edges, axis=0)==1] | |
855 | else: | |
856 | boundary_edges = edges[multiplicity(sorted_edges)==1] | |
857 | boundary_points = unique(boundary_edges) | |
858 | ||
859 | if False: | |
860 | print boundary_edges | |
861 | print incidence(boundary_edges) | |
862 | ||
863 | ||
864 | #create some random values on faces | |
865 | #we want to smooth them over the mesh to create a nice hilly landscape | |
866 | face_values = np.random.normal(size=d.nsimplex) | |
867 | #add some salt and pepper noise, to make our problem more interesting | |
868 | face_values[np.random.randint(d.nsimplex, size=10)] += 1000 | |
869 | ||
870 | #start with a median step, to remove salt-and-pepper noise | |
871 | #toggle to mean to see the effect of the median filter | |
872 | g = group_by(tris.flatten()) | |
873 | prestep = g.median if True else g.mean | |
874 | vertex_values = prestep(np.repeat(face_values, 3))[1] | |
875 | vertex_values[boundary_points] = 0 | |
876 | ||
877 | #actually, we can compute the mean without grouping | |
878 | tris_per_vert = g.count | |
879 | def scatter(x): | |
880 | r = np.zeros(d.npoints, x.dtype) | |
881 | for idx in tris.T: np.add.at(r, idx, x) | |
882 | return r / tris_per_vert | |
883 | def gather(x): | |
884 | return x[tris].mean(axis=1) | |
885 | ||
886 | #iterate a little | |
887 | for i in range(100): | |
888 | face_values = gather(vertex_values) | |
889 | vertex_values = scatter(face_values) | |
890 | vertex_values[boundary_points] = 0 | |
891 | ||
892 | ||
893 | #display our nicely rolling hills and their boundary | |
894 | import matplotlib.pyplot as plt | |
895 | x, y = points.T | |
896 | plt.tripcolor(x,y, triangles = tris, facecolors = face_values) | |
897 | plt.scatter(x[boundary_points], y[boundary_points]) | |
898 | plt.xlim(-1,1) | |
899 | plt.ylim(-1,1) | |
900 | plt.axis('equal') | |
901 | plt.show() | |
902 | ||
903 | d = test_meshing() |