Advertisement
Guest User

Fast Gaussian KDE

a guest
Jul 26th, 2010
1,613
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.72 KB | None | 0 0
  1. """
  2. A faster gaussian kernel density estimate (KDE).
  3. Intended for computing the KDE on a regular grid (different use case than
  4. scipy's original scipy.stats.kde.gaussian_kde()).
  5. -Joe Kington
  6. """
  7. __license__ = 'MIT License <http://www.opensource.org/licenses/mit-license.php>'
  8.  
  9. import numpy as np
  10. import scipy as sp
  11. import scipy.sparse
  12. import scipy.signal
  13.  
  14. def fast_kde(x, y, gridsize=(200, 200), extents=None, nocorrelation=False, weights=None):
  15.     """
  16.    Performs a gaussian kernel density estimate over a regular grid using a
  17.    convolution of the gaussian kernel with a 2D histogram of the data.
  18.  
  19.    This function is typically several orders of magnitude faster than
  20.    scipy.stats.kde.gaussian_kde for large (>1e7) numbers of points and
  21.    produces an essentially identical result.
  22.  
  23.    Input:
  24.        x: The x-coords of the input data points
  25.        y: The y-coords of the input data points
  26.        gridsize: (default: 200x200) A (nx,ny) tuple of the size of the output
  27.            grid
  28.        extents: (default: extent of input data) A (xmin, xmax, ymin, ymax)
  29.            tuple of the extents of output grid
  30.        nocorrelation: (default: False) If True, the correlation between the
  31.            x and y coords will be ignored when preforming the KDE.
  32.        weights: (default: None) An array of the same shape as x & y that
  33.            weighs each sample (x_i, y_i) by each value in weights (w_i).
  34.            Defaults to an array of ones the same size as x & y.
  35.    Output:
  36.        A gridded 2D kernel density estimate of the input points.
  37.    """
  38.     #---- Setup --------------------------------------------------------------
  39.     x, y = np.asarray(x), np.asarray(y)
  40.     x, y = np.squeeze(x), np.squeeze(y)
  41.    
  42.     if x.size != y.size:
  43.         raise ValueError('Input x & y arrays must be the same size!')
  44.  
  45.     nx, ny = gridsize
  46.     n = x.size
  47.  
  48.     if weights is None:
  49.         # Default: Weight all points equally
  50.         weights = np.ones(n)
  51.     else:
  52.         weights = np.squeeze(np.asarray(weights))
  53.         if weights.size != x.size:
  54.             raise ValueError('Input weights must be an array of the same size'
  55.                     ' as input x & y arrays!')
  56.  
  57.     # Default extents are the extent of the data
  58.     if extents is None:
  59.         xmin, xmax = x.min(), x.max()
  60.         ymin, ymax = y.min(), y.max()
  61.     else:
  62.         xmin, xmax, ymin, ymax = map(float, extents)
  63.     dx = (xmax - xmin) / (nx - 1)
  64.     dy = (ymax - ymin) / (ny - 1)
  65.  
  66.     #---- Preliminary Calculations -------------------------------------------
  67.  
  68.     # First convert x & y over to pixel coordinates
  69.     # (Avoiding np.digitize due to excessive memory usage!)
  70.     xyi = np.vstack((x,y)).T
  71.     xyi -= [xmin, ymin]
  72.     xyi /= [dx, dy]
  73.     xyi = np.floor(xyi, xyi).T
  74.  
  75.     # Next, make a 2D histogram of x & y
  76.     # Avoiding np.histogram2d due to excessive memory usage with many points
  77.     grid = sp.sparse.coo_matrix((weights, xyi), shape=(nx, ny)).toarray()
  78.  
  79.     # Calculate the covariance matrix (in pixel coords)
  80.     cov = np.cov(xyi)
  81.  
  82.     if nocorrelation:
  83.         cov[1,0] = 0
  84.         cov[0,1] = 0
  85.  
  86.     # Scaling factor for bandwidth
  87.     scotts_factor = np.power(n, -1.0 / 6) # For 2D
  88.  
  89.     #---- Make the gaussian kernel -------------------------------------------
  90.  
  91.     # First, determine how big the kernel needs to be
  92.     std_devs = np.diag(np.sqrt(cov))
  93.     kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)
  94.  
  95.     # Determine the bandwidth to use for the gaussian kernel
  96.     inv_cov = np.linalg.inv(cov * scotts_factor**2)
  97.  
  98.     # x & y (pixel) coords of the kernel grid, with <x,y> = <0,0> in center
  99.     xx = np.arange(kern_nx, dtype=np.float) - kern_nx / 2.0
  100.     yy = np.arange(kern_ny, dtype=np.float) - kern_ny / 2.0
  101.     xx, yy = np.meshgrid(xx, yy)
  102.  
  103.     # Then evaluate the gaussian function on the kernel grid
  104.     kernel = np.vstack((xx.flatten(), yy.flatten()))
  105.     kernel = np.dot(inv_cov, kernel) * kernel
  106.     kernel = np.sum(kernel, axis=0) / 2.0
  107.     kernel = np.exp(-kernel)
  108.     kernel = kernel.reshape((kern_ny, kern_nx))
  109.  
  110.     #---- Produce the kernel density estimate --------------------------------
  111.  
  112.     # Convolve the gaussian kernel with the 2D histogram, producing a gaussian
  113.     # kernel density estimate on a regular grid
  114.     grid = sp.signal.convolve2d(grid, kernel, mode='same', boundary='fill').T
  115.  
  116.     # Normalization factor to divide result by so that units are in the same
  117.     # units as scipy.stats.kde.gaussian_kde's output.  
  118.     norm_factor = 2 * np.pi * cov * scotts_factor**2
  119.     norm_factor = np.linalg.det(norm_factor)
  120.     norm_factor = n * dx * dy * np.sqrt(norm_factor)
  121.  
  122.     # Normalize the result
  123.     grid /= norm_factor
  124.  
  125.     return np.flipud(grid)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement