• API
• FAQ
• Tools
• Trends
• Archive
SHARE
TWEET

# Fast Gaussian KDE

a guest Jul 26th, 2010 1,219 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. """
2. A faster gaussian kernel density estimate (KDE).
3. Intended for computing the KDE on a regular grid (different use case than
4. scipy's original scipy.stats.kde.gaussian_kde()).
5. -Joe Kington
6. """
8.
9. import numpy as np
10. import scipy as sp
11. import scipy.sparse
12. import scipy.signal
13.
14. def fast_kde(x, y, gridsize=(200, 200), extents=None, nocorrelation=False, weights=None):
15.     """
16.    Performs a gaussian kernel density estimate over a regular grid using a
17.    convolution of the gaussian kernel with a 2D histogram of the data.
18.
19.    This function is typically several orders of magnitude faster than
20.    scipy.stats.kde.gaussian_kde for large (>1e7) numbers of points and
21.    produces an essentially identical result.
22.
23.    Input:
24.        x: The x-coords of the input data points
25.        y: The y-coords of the input data points
26.        gridsize: (default: 200x200) A (nx,ny) tuple of the size of the output
27.            grid
28.        extents: (default: extent of input data) A (xmin, xmax, ymin, ymax)
29.            tuple of the extents of output grid
30.        nocorrelation: (default: False) If True, the correlation between the
31.            x and y coords will be ignored when preforming the KDE.
32.        weights: (default: None) An array of the same shape as x & y that
33.            weighs each sample (x_i, y_i) by each value in weights (w_i).
34.            Defaults to an array of ones the same size as x & y.
35.    Output:
36.        A gridded 2D kernel density estimate of the input points.
37.    """
38.     #---- Setup --------------------------------------------------------------
39.     x, y = np.asarray(x), np.asarray(y)
40.     x, y = np.squeeze(x), np.squeeze(y)
41.
42.     if x.size != y.size:
43.         raise ValueError('Input x & y arrays must be the same size!')
44.
45.     nx, ny = gridsize
46.     n = x.size
47.
48.     if weights is None:
49.         # Default: Weight all points equally
50.         weights = np.ones(n)
51.     else:
52.         weights = np.squeeze(np.asarray(weights))
53.         if weights.size != x.size:
54.             raise ValueError('Input weights must be an array of the same size'
55.                     ' as input x & y arrays!')
56.
57.     # Default extents are the extent of the data
58.     if extents is None:
59.         xmin, xmax = x.min(), x.max()
60.         ymin, ymax = y.min(), y.max()
61.     else:
62.         xmin, xmax, ymin, ymax = map(float, extents)
63.     dx = (xmax - xmin) / (nx - 1)
64.     dy = (ymax - ymin) / (ny - 1)
65.
66.     #---- Preliminary Calculations -------------------------------------------
67.
68.     # First convert x & y over to pixel coordinates
69.     # (Avoiding np.digitize due to excessive memory usage!)
70.     xyi = np.vstack((x,y)).T
71.     xyi -= [xmin, ymin]
72.     xyi /= [dx, dy]
73.     xyi = np.floor(xyi, xyi).T
74.
75.     # Next, make a 2D histogram of x & y
76.     # Avoiding np.histogram2d due to excessive memory usage with many points
77.     grid = sp.sparse.coo_matrix((weights, xyi), shape=(nx, ny)).toarray()
78.
79.     # Calculate the covariance matrix (in pixel coords)
80.     cov = np.cov(xyi)
81.
82.     if nocorrelation:
83.         cov[1,0] = 0
84.         cov[0,1] = 0
85.
86.     # Scaling factor for bandwidth
87.     scotts_factor = np.power(n, -1.0 / 6) # For 2D
88.
89.     #---- Make the gaussian kernel -------------------------------------------
90.
91.     # First, determine how big the kernel needs to be
92.     std_devs = np.diag(np.sqrt(cov))
93.     kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)
94.
95.     # Determine the bandwidth to use for the gaussian kernel
96.     inv_cov = np.linalg.inv(cov * scotts_factor**2)
97.
98.     # x & y (pixel) coords of the kernel grid, with <x,y> = <0,0> in center
99.     xx = np.arange(kern_nx, dtype=np.float) - kern_nx / 2.0
100.     yy = np.arange(kern_ny, dtype=np.float) - kern_ny / 2.0
101.     xx, yy = np.meshgrid(xx, yy)
102.
103.     # Then evaluate the gaussian function on the kernel grid
104.     kernel = np.vstack((xx.flatten(), yy.flatten()))
105.     kernel = np.dot(inv_cov, kernel) * kernel
106.     kernel = np.sum(kernel, axis=0) / 2.0
107.     kernel = np.exp(-kernel)
108.     kernel = kernel.reshape((kern_ny, kern_nx))
109.
110.     #---- Produce the kernel density estimate --------------------------------
111.
112.     # Convolve the gaussian kernel with the 2D histogram, producing a gaussian
113.     # kernel density estimate on a regular grid
114.     grid = sp.signal.convolve2d(grid, kernel, mode='same', boundary='fill').T
115.
116.     # Normalization factor to divide result by so that units are in the same
117.     # units as scipy.stats.kde.gaussian_kde's output.
118.     norm_factor = 2 * np.pi * cov * scotts_factor**2
119.     norm_factor = np.linalg.det(norm_factor)
120.     norm_factor = n * dx * dy * np.sqrt(norm_factor)
121.
122.     # Normalize the result
123.     grid /= norm_factor
124.
125.     return np.flipud(grid)
RAW Paste Data
Top