1 | """ | |
2 | note on np.unique | |
3 | ||
4 | this module expands on a the functionality one would currently bootstrap from np.unique | |
5 | to facilitate the functionality implemented in this module, a generalization of unique is proposed | |
6 | that is, the version of unique implemented here also takes a lex-sortable tuple, | |
7 | or multidimensional array viewed along one of its axes as a valid input; ie: | |
8 | ||
9 | #stack of 4 images | |
10 | images = np.random.rand(4,64,64) | |
11 | #shuffle the images; this is a giant mess now; how to find the unique ones? | |
12 | shuffled = images[np.random.randint(0,4,200)] | |
13 | #there you go | |
14 | print unique(shuffled, axis=0) | |
15 | ||
16 | furthermore, some np.unique related functions are proposed, with a more friendly interface | |
17 | (multiplicity, count) | |
18 | """ | |


"""
A note on pandas
This module has substantial overlap with pandas' grouping functionality.
So what's the reason for implementing it in numpy?

Primarily: the concept of grouping is far more general than pandas' dataframe.
There is no reason why numpy ndarrays should not have a solid core of grouping functionality.

The recently added ufunc support means that we can now express grouping operations in pure numpy;
that is, without any slow python loops or cumbersome C-extensions.

I was not aware of pandas' functionality on this front when I started working on this;
so I am somewhat pleased to say that many core design elements have turned out the same.
I take that to mean these design decisions probably make sense.

It does raise the question as to where the proper line between pandas and numpy lies.
I would argue that, evidently, most of pandas' functionality has no place in numpy.
Then how is grouping different? I feel what lies at the heart of pandas is a permanent conceptual
association between various pieces of data, assembled in a dataframe and all its metadata.
I think numpy ought to stay well clear of that.
On the other hand, you don't want to go around creating a pandas dataframe
just to plot a radial reduction; these kinds of transient single-statement associations
between keys and values are very useful, entirely independently of some heavyweight framework.

Further questions raised by pandas: should we have some form of merge/join functionality too?
or is this getting too panda-ey? all the use cases I can think of fail to be pressed
into some kind of standard mould, but I might be missing something here
"""


import numpy as np
import itertools


59 | """ | |
60 | this toggle switches between preferred or backwards compatible semantics | |
61 | for dealing with key objects. current behavior for the arguments to functions | |
62 | like np.unique is to flatten any input arrays. | |
63 | i think a more unified semantics is achieved by interpreting all key arguments as | |
64 | sequences of key objects, whereby non-flat arrays are simply sequences of keys, | |
65 | whereby the keys themselves are ndim-1-arrays | |
66 | for reasons of backwards compatibility, it is probably wise to retain the default, | |
67 | but at least an axis keyword to toggle this behavior would be a welcome addition | |
68 | """ | |
69 | backwards_compatible = False | |
70 | if backwards_compatible: | |
71 | axis_default = None | |
72 | else: | |
73 | axis_default = 0 | |
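
#a minimal sketch of what the toggle changes, with illustrative values
#(the unique function referenced here is defined further down in this module):
##duplicate_rows = np.array([[0, 1], [0, 1], [1, 0]])
##unique(duplicate_rows, axis=0)       #rows are the keys -> [[0, 1], [1, 0]]
##unique(duplicate_rows, axis=None)    #flattened, np.unique-style -> [0, 1]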


"""
some utility functions
"""

def as_struct_array(*cols):
    """pack a bunch of columns as a struct"""
    cols = [np.asarray(c) for c in cols]
    rows = len(cols[0])
    data = np.empty(rows, [('f'+str(i), c.dtype, c.shape[1:]) for i,c in enumerate(cols)])
    for i,c in enumerate(cols):
        data['f'+str(i)] = c
    return data
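
#a small sketch of the packing (values are illustrative only):
##s = as_struct_array(np.arange(3), np.linspace(0, 1, 3))
##s['f0'], s['f1']    #the original columns, now packed per-row into one struct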

def axis_as_object(arr, axis=-1):
    """
    cast the given axis of an array to a void object
    if the axis to be cast is contiguous, a view is returned, otherwise a copy
    this is useful for efficiently sorting by the content of an axis, for instance
    """
    shape = arr.shape
    arr = np.ascontiguousarray(np.swapaxes(arr, axis, -1))
    return arr.view(np.dtype((np.void, arr.dtype.itemsize * shape[axis]))).reshape(np.delete(shape, axis))

def object_as_axis(arr, dtype, axis=-1):
    """cast an array of void objects to a typed axis"""
    return np.swapaxes(arr.view(dtype).reshape(arr.shape+(-1,)), axis, -1)


def array_as_object(arr):
    """view everything but the first axis as a void object"""
    arr = arr.reshape(len(arr),-1)
    return axis_as_object(arr)

def array_as_typed(arr, dtype, shape):
    """unwrap a void object to its original type and shape"""
    return object_as_axis(arr, dtype).reshape(shape)
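
#a minimal sketch of the void-view round trip these helpers provide (illustrative values):
##arr = np.arange(12).reshape(3, 4)
##packed = axis_as_object(arr)                   #one void object per row
##assert packed.shape == (3,)
##restored = object_as_axis(packed, arr.dtype)   #recover the typed axis
##assert np.array_equal(arr, restored)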

##def array_as_struct(arr):
##    return np.ascontiguousarray(arr).view([('f0', arr.dtype, arr.shape[1:])])#.flatten()
##def struct_as_array(arr):
##    return arr.view(arr['f0'].dtype)


121 | """ | |
122 | class hierarchy for indexing a set of keys | |
123 | the class hierarchy allows for code reuse, while providing specializations for different | |
124 | types of key objects, yielding maximum flexibility and performance | |
125 | """ | |
126 | ||
127 | """ | |
128 | A note on naming: 'Index' here refers to the fact that the goal of these classes is to | |
129 | perform and store precomputations on a set of keys, | |
130 | such as to accelerate subsequent operations involving these keys. | |
131 | They are not 'logical' indexes as in pandas; | |
132 | they are not permanently associated with any other data objects | |
133 | ||
134 | Note that these classes are not primarily intended to be used directly from the numpy namespace, | |
135 | but rather are intended for code reuse within a family of higher level operations, | |
136 | only the latter need to be part of the numpy API. | |
137 | ||
138 | That said, these classes can also be very useful | |
139 | in those places where the standard operations do not quite cover your needs, | |
140 | saving your from completely reinventing the wheel. | |
141 | """ | |


class BaseIndex(object):
    """
    minimal indexing functionality
    only provides unique and counts, but with optimal performance
    no grouping or lex-keys are supported,
    nor anything else that would require an indirect sort
    """
    def __init__(self, keys):
        """
        keys is a flat array of possibly composite type
        """
        self.keys = np.asarray(keys).flatten()
        self.sorted = np.sort(self.keys)
        #the slicing points of the bins to reduce over
        self.flag = self.sorted[:-1] != self.sorted[1:]
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def start(self):
        """start index of all bins"""
        return self.slices[:-1]
    @property
    def stop(self):
        """stop index of all bins"""
        return self.slices[1:]
    @property
    def groups(self):
        """number of groups formed by the keys"""
        return len(self.start)
    @property
    def size(self):
        """number of keys"""
        return self.keys.size

    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return self.sorted[self.start]
    @property
    def uniques(self):
        """number of unique keys"""
        return len(self.slices)-1

    @property
    def count(self):
        """number of times each key occurs"""
        return np.diff(self.slices)
    @property
    def uniform(self):
        """returns true if each key occurs an equal number of times"""
        return not np.any(np.diff(self.count))
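
#a quick sketch of BaseIndex in action (illustrative values):
##idx = BaseIndex(np.array([1, 1, 2, 3, 3, 3]))
##idx.unique     #-> [1, 2, 3]
##idx.count      #-> [2, 1, 3]
##idx.uniform    #-> False; the keys do not all occur equally often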


class Index(BaseIndex):
    """
    index object over a set of keys
    adds support for more extensive functionality, notably grouping
    """
    def __init__(self, keys):
        """
        keys is a flat array of possibly composite type
        """
        self.keys = np.asarray(keys)
        #find indices which sort the keys
        self.sorter = np.argsort(self.keys)
        #compute the sorted keys
        self.sorted = self.keys[self.sorter]
        #the slicing points of the bins to reduce over
        self.flag = self.sorted[:-1] != self.sorted[1:]
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def index(self):
        """I've never run into any use cases for this;
        perhaps it was intended to be used to do group_by(keys).first(values)?
        in any case, included for backwards compatibility with np.unique"""
        return self.sorter[self.start]
    @property
    def rank(self):
        """how high in the sorted list each key is"""
        return self.sorter.argsort()
    @property
    def sorted_group_rank_per_key(self):
        """find a better name for this? enumeration of sorted keys. also used in the median implementation"""
        return np.cumsum(np.concatenate(([False], self.flag)))
    @property
    def inverse(self):
        """return index array that maps unique values back to original space"""
        return self.sorted_group_rank_per_key[self.rank]


##    def where(self, other):
##        """
##        determine at which indices a second set equals a first
##        """
##        return np.searchsorted(self.unique, other)
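
#the defining property of the inverse mapping, as a sketch (illustrative values):
##idx = Index(np.array([3, 1, 3, 2]))
##np.array_equal(idx.unique[idx.inverse], idx.keys)    #-> True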


class ObjectIndex(Index):
    """
    the given axis enumerates the keys;
    all other axes form the keys
    groups will be formed on the basis of bitwise equality

    should we retire ObjectIndex?
    this can be integrated with the regular Index a la lexsort, no?
    not sure what is more readable though
    """
    def __init__(self, keys, axis):
        self.axis = axis
        self.dtype = keys.dtype

        keys = np.swapaxes(keys, axis, 0)
        self.shape = keys.shape
        keys = array_as_object(keys)

        super(ObjectIndex, self).__init__(keys)

    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        sorted = array_as_typed(self.sorted, self.dtype, self.shape)
        return np.swapaxes(sorted[self.start], self.axis, 0)


class LexIndex(Index):
    """
    index object based on lexicographic ordering of a tuple of key-arrays
    key arrays can be of any type, including multi-dimensional, structured, or void objects
    however, passing such fancy keys to LexIndex may not be ideal from a performance perspective,
    as lexsort does not accept them as arguments directly, so we have to index and invert them first

    should you find yourself with such complex keys, it may be more efficient
    to place them into a structured array first

    note that multidimensional columns are indexed by their first axis,
    and no per-column axis keyword is supplied;
    customization of column layout will have to be done at the call site
    """
    def __init__(self, keys):
        self.keys   = tuple(np.asarray(key) for key in keys)
        self.dtypes = tuple(key.dtype for key in self.keys)
        self.shapes = tuple(key.shape for key in self.keys)

        keyviews = tuple(array_as_object(key) if key.ndim > 1 else key for key in self.keys)
        #find indices which sort the keys; complex keys which lexsort does not accept are bootstrapped from Index
        self.sorter = np.lexsort(tuple(Index(key).inverse if key.dtype.kind == 'V' else key for key in keyviews))
        #compute the sorted keys
        self.sorted = tuple(key[self.sorter] for key in keyviews)
        #the slicing points of the bins to reduce over
        self.flag = reduce(
            np.logical_or,
            (s[:-1] != s[1:] for s in self.sorted))
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def unique(self):
        """returns a tuple of unique key columns"""
        return tuple(
            (array_as_typed(s, dtype, shape) if len(shape) > 1 else s)[self.start]
            for s, dtype, shape in zip(self.sorted, self.dtypes, self.shapes))

    @property
    def size(self):
        return self.sorter.size
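
#LexIndex in a nutshell (illustrative values):
##li = LexIndex((np.array([0, 0, 1, 1]), np.array([0, 0, 1, 0])))
##li.unique    #-> (array([0, 1, 1]), array([0, 0, 1])); the unique key-pairs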

class LexIndexSimple(Index):
    """
    simplified LexIndex, which only accepts 1-d arrays of simple dtypes
    the more expressive LexIndex has only some logic overhead
    in case all columns are indeed simple; not sure that is worth the extra code
    """
    def __init__(self, keys):
        self.keys = tuple(np.asarray(key) for key in keys)
        self.sorter = np.lexsort(self.keys)
        #compute the sorted keys
        self.sorted = tuple(key[self.sorter] for key in self.keys)
        #the slicing points of the bins to reduce over
        self.flag = reduce(
            np.logical_or,
            (s[:-1] != s[1:] for s in self.sorted))
        self.slices = np.concatenate((
            [0],
            np.flatnonzero(self.flag)+1,
            [self.size]))

    @property
    def unique(self):
        """the first entry of each bin is a unique key"""
        return tuple(s[self.start] for s in self.sorted)

    @property
    def size(self):
        return self.sorter.size


def as_index(keys, axis = axis_default, base=False):
    """
    casting rules for a keys object to an index

    the preferred semantics is that keys is a sequence of key objects,
    except when keys is an instance of tuple,
    in which case the zipped elements of the tuple are the key objects

    the axis keyword specifies the axis which enumerates the keys
    if axis is None, the keys array is flattened
    if axis is 0, the first axis enumerates the keys
    which of these two is the default depends on whether backwards_compatible == True

    if base==True, the most basic index possible is constructed.
    this avoids an indirect sort; if that isn't required, this has better performance
    """
    if isinstance(keys, Index):
        return keys
    if isinstance(keys, tuple):
        return LexIndex(keys)
    try:
        keys = np.asarray(keys)
    except Exception:
        raise TypeError('Given object does not form a valid set of keys')
    if axis is None:
        keys = keys.flatten()
    if keys.ndim==1:
        if base:
            return BaseIndex(keys)
        else:
            return Index(keys)
    else:
        return ObjectIndex(keys, axis)
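
#the casting rules at a glance (illustrative calls, under the preferred semantics):
##as_index(np.array([1, 2, 1]))               #1d array          -> Index
##as_index(np.array([1, 2, 1]), base=True)    #no indirect sort  -> BaseIndex
##as_index((np.arange(4), np.arange(4)))      #tuple             -> LexIndex over the zipped columns
##as_index(np.eye(3))                         #nd array          -> ObjectIndex; rows are the keys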




"""
public API starts here
"""


class GroupBy(object):
    """
    supports ufunc reduction
    should any other form of reduction be supported?
    not sure it should; more cleanly written externally I think, on grouped iterables
    """
    def __init__(self, keys, axis = 0):
        #we could inherit from Index, but the multiple specializations make
        #holding a reference to an index object more appropriate
        #note that we don't have backwards compatibility issues with groupby,
        #so we are going to use axis = 0 as a default
        #the multi-dimensional structure of a keys object is usually meaningful,
        #and value arguments are not automatically flattened either

        self.index = as_index(keys, axis)

    #forward interesting/'public' index properties
    @property
    def unique(self):
        return self.index.unique
    @property
    def count(self):
        return self.index.count
    @property
    def inverse(self):
        return self.index.inverse
    @property
    def rank(self):
        return self.index.rank

    #some different methods of chopping up a set of values by key
    #not sure they are all equally relevant, but I actually have real world use cases for most of them

    def as_iterable_from_iterable(self, values):
        """
        grouping of an iterable. memory consumption depends on the amount of sorting required
        worst case, if index.sorter[-1] = 0, we need to consume the entire value iterable
        before we can start yielding any output
        but to the extent that the keys come presorted, the grouping is lazy
        """
        values = iter(enumerate(values))
        cache = dict()
        def get_value(ti):
            try:
                return cache.pop(ti)
            except KeyError:
                while True:
                    i, v = next(values)
                    if i == ti:
                        return v
                    cache[i] = v
        s = iter(self.index.sorter)
        for c in self.count:
            yield (get_value(i) for i in itertools.islice(s, c))

    def as_outoforder(self, values):
        """
        group values, without regard for the ordering of self.index.unique
        consume values as they come, and yield key-group pairs as soon as they complete
        this approach is lazy, insofar as grouped values are close together in their iterable
        """
        from collections import defaultdict
        cache = defaultdict(list)
        count = self.count
        unique = self.unique
        key = (lambda i: unique[i]) if isinstance(unique, np.ndarray) else (lambda i: tuple(c[i] for c in unique))
        for i,v in itertools.izip(self.inverse, values):
            cache[i].append(v)
            if len(cache[i]) == count[i]:
                yield key(i), cache.pop(i)

    def as_iterable_from_sequence(self, values):
        """
        this is the preferred method if values has random access,
        but we don't want it completely in memory;
        like a big memory mapped file, for instance
        """
        s = iter(self.index.sorter)
        for c in self.count:
            yield (values[i] for i in itertools.islice(s, c))

    def as_array(self, values):
        """
        return grouped values as an ndarray
        returns an array of shape [groups, groupsize, ungrouped-axes]
        this is only possible if index.uniform==True
        """
        assert(self.index.uniform)
        values = np.asarray(values)
        values = values[self.index.sorter]
        return values.reshape(self.index.groups, self.count[0], *values.shape[1:])

    def as_list(self, values):
        """return grouped values as a list of arrays, or a jagged-array"""
        values = np.asarray(values)
        values = values[self.index.sorter]
        return np.split(values, self.index.slices[1:-1], axis=0)

    def group(self, values):
        """group values as an ndarray if the groups are uniform, else as a list of arrays"""
        try:
            return self.as_array(values)
        except AssertionError:
            return self.as_list(values)

    def __call__(self, values):
        """
        not sure how I feel about this. explicit is better than implicit;
        but group_by(keys).group(values) does not sit too well with me either
        """
        return self.unique, self.group(values)


    # ufunc based reduction methods

    def reduce(self, values, operator = np.add):
        """
        reduce the values over identical key groups, using the given ufunc
        reduction is over the first axis, which should have elements corresponding to the keys
        all other axes are treated independently for the sake of this reduction
        note that this code is only C-vectorized over the first axis;
        that is fine if this inner loop is significant, but not so much if it isn't
        if we have only few keys, but values[0].size is substantial, a reduction via
        as_list may be preferable
        """
        values = values[self.index.sorter]
        if values.ndim>1:
            return np.apply_along_axis(
                lambda slc: operator.reduceat(slc, self.index.start),
                0, values)
        else:
            return operator.reduceat(values, self.index.start)
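
    #a minimal sketch of the reduceat mechanism relied upon above (illustrative values,
    #assumed to be already sorted by key, with groups starting at offsets 0 and 2):
    ##np.add.reduceat(np.array([1., 2., 3., 4.]), np.array([0, 2]))    #-> [3., 7.]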

    def sum(self, values, axis=0):
        """compute the sum over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values)

    def mean(self, values, axis=0):
        """compute the mean over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.count.reshape(-1,*(1,)*(values.ndim-1))
        return self.unique, self.reduce(values) / count

    def var(self, values, axis=0):
        """compute the variance over each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        count = self.count.reshape(-1,*(1,)*(values.ndim-1))
        mean = self.reduce(values) / count
        err = values - mean[self.inverse]
        return self.unique, self.reduce(err**2) / count

    def std(self, values, axis=0):
        """standard deviation over each group"""
        unique, var = self.var(values, axis)
        return unique, np.sqrt(var)

    def median(self, values, axis=0, average=True):
        """
        compute the median value over each group
        when average is true, the average of the two central values is taken
        for groups with an even key-count
        """
        values = np.asarray(values)

        mid_2 = self.index.start + self.index.stop
        hi = (mid_2    ) // 2
        lo = (mid_2 - 1) // 2
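        #for example, a group spanning [2, 5) gives mid_2 = 7 and hi = lo = 3: the single middle element;
        #a group spanning [2, 6) gives mid_2 = 8, hi = 4, lo = 3: the two central values to average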

        #need this indirection for lex-index compatibility
        sorted_group_rank_per_key = self.index.sorted_group_rank_per_key

        def median1d(slc):
            #place values at correct keys; preconditions the upcoming lexsort
            slc = slc[self.index.sorter]
            #refine value sorting within each keygroup
            sorter = np.lexsort((slc, sorted_group_rank_per_key))
            slc = slc[sorter]
            return (slc[lo]+slc[hi]) / 2 if average else slc[hi]

        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        if values.ndim>1:   #is trying to skip apply_along_axis somewhat premature optimization?
            values = np.apply_along_axis(median1d, 0, values)
        else:
            values = median1d(values)
        return self.unique, values

    def min(self, values, axis=0):
        """return the minimum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.minimum)

    def max(self, values, axis=0):
        """return the maximum within each group"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, self.reduce(values, np.maximum)

    def first(self, values, axis=0):
        """return values at the first occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.start]]

    def last(self, values, axis=0):
        """return values at the last occurrence of their associated key"""
        values = np.asarray(values)
        if axis: values = np.rollaxis(values, axis)
        return self.unique, values[self.index.sorter[self.index.stop-1]]



#implement iter interface? could simply do zip(group_by(keys)(values)), no?

#just an alias, for those who don't like camelcase
group_by = GroupBy
#but we can also turn this into a function with optional values and reduction func;
#note that this definition shadows the alias above
def group_by(keys, values=None, reduction=None, axis=0):
    g = GroupBy(keys, axis)
    if values is None:
        return g
    groups = g.group(values)
    if reduction is None:
        return groups
    return [reduction(group) for group in groups]
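
#a minimal sketch of the grouping interface (illustrative values):
##keys   = np.array([0, 1, 0, 1])
##values = np.array([1., 2., 3., 4.])
##group_by(keys).sum(values)        #-> (array([0, 1]), array([4., 6.]))
##group_by(keys, values, np.sum)    #-> [4.0, 6.0]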



"""
some demonstration of the overlap between this code and np.unique;
they share a lot of common functionality
we can't quite bootstrap grouping from unique as it is,
but unique can easily be reimplemented using the Index class
"""

def unique(keys, return_index = False, return_inverse = False, return_count = False, axis = axis_default):
    """
    backwards compatible interface with numpy.unique
    in the long term I think the kwargs should be deprecated though;
    cleaner to call Index and its properties directly,
    should you want something beyond the interface provided
    """
    index = as_index(keys, axis, base = not (return_index or return_inverse))

    ret = index.unique,
    if return_index:
        ret = ret + (index.index,)
    if return_inverse:
        ret = ret + (index.inverse,)
    if return_count:
        ret = ret + (index.count,)
    return ret[0] if len(ret) == 1 else ret

def count(keys, axis = axis_default):
    """numpy work-alike of collections.Counter"""
    index = as_index(keys, axis, base = True)
    return index.unique, index.count

def multiplicity(keys, axis = axis_default):
    """
    return the multiplicity of each key, or how often it occurs in the set
    note that we are not interested in the unique values for this operation,
    casting doubt on the current numpy design which places np.unique
    at the center of the interface for this kind of operation
    given how often I use multiplicity, I'd like to have it in the numpy namespace
    """
    index = as_index(keys, axis)
    return index.count[index.inverse]
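
#count versus multiplicity, at a glance (illustrative values):
##keys = np.array([0, 0, 1, 2, 2, 2])
##count(keys)           #-> (array([0, 1, 2]), array([2, 1, 3])); one entry per unique key
##multiplicity(keys)    #-> array([2, 2, 1, 3, 3, 3]); one entry per key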

def rank(keys, axis = axis_default):
    """
    where each item is in the pecking order
    not sure this should be part of the public API; can't think of a use-case right away
    plus, we have a namespace conflict
    """
    index = as_index(keys, axis)
    return index.rank


def incidence(boundary):
    """
    given an Nxm matrix containing boundary info between simplices,
    compute incidence info matrix
    not to be part of the numpy API, to be sure; just something I'm playing with
    """
    #flatten explicitly, so the grouping is per-element regardless of the axis_default semantics
    return group_by(boundary.flatten()).group(np.arange(boundary.size) // boundary.shape[1])



def test_basic():

    keys   = ["e", "b", "b", "c", "d", "e", "e", 'a']
    values = [1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1]

##    keys   = np.array(["a", "b", "b", "c", "d", "e", "e", 'd', 'a', 'c'])
##    values = np.array([1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01, 1, 2, 3])

    print 'two methods of computing non-reducing group'
    print 'as iterable'
    g = group_by(keys)
    for k, v in zip(g.unique, g.as_iterable_from_sequence(values)):
        print k, list(v)
    print 'as list'
    for k, v in zip(*group_by(keys)(values)):
        print k, v

    print 'some reducing group operations'
    g = group_by(keys)
    unique_keys, reduced_values = g.median(values)
    print 'per group median'
    print reduced_values
    unique_keys, reduced_values = g.mean(values)
    print 'per group mean'
    print reduced_values
    unique_keys, reduced_values = g.std(values)
    print 'per group std'
    print reduced_values
    reduced_values = g.reduce(np.array(values), np.minimum)    #alternate way of calling
    print 'per group min'
    print reduced_values
    unique_keys, reduced_values = g.max(values)
    print 'per group max'
    print reduced_values


##test_basic()
##quit()


def test_lex_median():
    """
    for making sure I squashed all bugs related to fancy-keys and the median implementation
    """
    keys1 = ["e", "b", "b", "c", "d", "e", "e", 'a']
    keys2 = ["b", "b", "b", "d", "e", "e", 'e', 'e']
##    keys3 = np.random.randint(0, 2, (8, 2))
    values = [1.2, 4.5, 4.3, 2.0, 5.6, 8.8, 9.1, 1]

    unique, median = group_by((keys1, keys2)).median(values)
    for i in zip(zip(*unique), median):
        print i

##test_lex_median()
##quit()


def test_fancy_keys():
    """
    test the Index subclasses
    """
    keys = np.random.randint(0, 2, (20,3)).astype(np.int8)
    values = np.random.randint(-1, 2, (20,4))


    #all these various datastructures should produce the same behavior
    #multiplicity is a nice unit test, since it draws on most of the low level functionality
    if backwards_compatible:
        assert(np.all(
            multiplicity(keys, axis=0) ==           #void object indexing
            multiplicity(tuple(keys.T))))           #lexicographic indexing
        assert(np.all(
            multiplicity(keys, axis=0) ==           #void object indexing
            multiplicity(as_struct_array(keys))))   #struct array indexing
    else:
        assert(np.all(
            multiplicity(keys) ==                   #void object indexing
            multiplicity(tuple(keys.T))))           #lexicographic indexing
        assert(np.all(
            multiplicity(keys) ==                   #void object indexing
            multiplicity(as_struct_array(keys))))   #struct array indexing

    #let's go mixing some dtypes!
    floatkeys = np.zeros(len(keys))
    floatkeys[0] = 8.8
    print 'sum per group of identical rows using struct key'
    g = group_by(as_struct_array(keys, floatkeys))
    for e in zip(g.count, *g.sum(values)):
        print e
    print 'sum per group of identical rows using lex of nd-key'
    g = group_by((keys, floatkeys))
    for e in zip(zip(*g.unique), g.sum(values)[1]):
        print e
    print 'sum per group of identical rows using lex of struct key'
    g = group_by((as_struct_array(keys), floatkeys))
    for e in zip(zip(*g.unique), g.sum(values)[1]):
        print e

    #showcase enhanced unique functionality
    images = np.random.rand(4,4,4)
    #shuffle the images; this is a giant mess now; how to find the unique ones?
    shuffled = images[np.random.randint(0,4,200)]
    #there you go
    if backwards_compatible:
        print unique(shuffled, axis=0)
    else:
        print unique(shuffled)


##test_fancy_keys()
##quit()


def test_radial():
    x = np.linspace(-2, 2, 64)
    y = x[:, None]
    x = x[None, :]
    R = np.sqrt(x**2 + y**2)

    def airy(r, sigma):
        from scipy.special import j1
        r = r / sigma * np.sqrt(2)
        a = (2*j1(r)/r)**2
        a[r==0] = 1
        return a
    def gauss(r, sigma):
        return np.exp(-(r/sigma)**2)

    distribution = np.random.choice([gauss, airy])(R, 0.3)
    sample = np.random.poisson(distribution*200+10).astype(np.float)

    import matplotlib.pyplot as pp
    #is this an airy or gaussian function? hard to tell with all this noise!
    pp.imshow(sample, interpolation='nearest', cmap='gray')
    pp.show()
    #radial reduction to the rescue!
    #if we are sampling an airy function, you will see a small but significant rise around x=1
    g = group_by(np.round(R, 5).flatten())
    pp.errorbar(
        g.unique,
        g.mean(sample.flatten())[1],
        g.std(sample.flatten())[1] / np.sqrt(g.count))
    pp.xlim(0, 2)
    pp.show()

##test_radial()
##quit()

def test_meshing():
    """
    meshing example
    demonstrates the use of multiplicity, and group.median
    """
    #set up some random points, and get their delaunay triangulation
    points = np.random.random((20000,2))*2 - 1
    points = points[np.linalg.norm(points, axis=1) < 1]
    from scipy.spatial import Delaunay
    d = Delaunay(points)
    tris = d.simplices

    #the operations provided in this module allow us to express potentially complex
    #computational geometry questions elegantly in numpy
    #Delaunay.neighbors could be used as well,
    #but the point is to express this in pure numpy, without additional library functionality
    edges = tris[:, [[0,1],[1,2],[2,0]]].reshape(-1, 2)
    sorted_edges = np.where(edges[:,0:1] < edges[:,1:2], edges, edges[:,::-1])
    #we test to see how often each edge occurs, or how many incident simplices it has
    #this is a very general method of finding the boundary of any topology
    #and we can do so here with only one simple and readable command, multiplicity == 1
    if backwards_compatible:
        boundary_edges = edges[multiplicity(sorted_edges, axis=0) == 1]
    else:
        boundary_edges = edges[multiplicity(sorted_edges) == 1]
    boundary_points = unique(boundary_edges.flatten())

    if False:
        print boundary_edges
        print incidence(boundary_edges)


    #create some random values on faces
    #we want to smooth them over the mesh to create a nice hilly landscape
    face_values = np.random.normal(size=d.nsimplex)
    #add some salt and pepper noise, to make our problem more interesting
    face_values[np.random.randint(d.nsimplex, size=10)] += 1000

    #start with a median step, to remove the salt-and-pepper noise
    #toggle to mean to see the effect of the median filter
    g = group_by(tris.flatten())
    prestep = g.median if True else g.mean
    vertex_values = prestep(np.repeat(face_values, 3))[1]
    vertex_values[boundary_points] = 0

    #actually, we can compute the mean without grouping
    tris_per_vert = g.count
    def scatter(x):
        r = np.zeros(d.npoints, x.dtype)
        for idx in tris.T: np.add.at(r, idx, x)
        return r / tris_per_vert
    def gather(x):
        return x[tris].mean(axis=1)

    #iterate a little
    for i in range(100):
        face_values   = gather(vertex_values)
        vertex_values = scatter(face_values)
        vertex_values[boundary_points] = 0


    #display our nicely rolling hills and their boundary
    import matplotlib.pyplot as plt
    x, y = points.T
    plt.tripcolor(x, y, triangles=tris, facecolors=face_values)
    plt.scatter(x[boundary_points], y[boundary_points])
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.axis('equal')
    plt.show()

test_meshing()