SparseRBM

__author__ = 'Miroslaw Horbal'
__email__ = 'miroslaw@gmail.com'
__date__ = 'Dec 02, 2013'

import numpy as np
from numpy import random
from scipy import sparse
import time
from sys import stdout

class SparseRBM(object):
    """
    SparseRBM implements an RBM using a sparse subsampling procedure to
    speed up runtime. The idea is to perform Gibbs sampling on a subset of
    the visible layer instead of the full visible layer. This is achieved by
    passing along a list of indexes for the active features in each Gibbs step.
    Gibbs sampling is run on the active features while all other features are
    assumed to have an activation probability of 0.

    The current implementation only considers features that are active in at
    least one data point in the mini-batch. Other selection procedures can be
    used by implementing a different 'make_batches' procedure in the source code.

    Training is performed using stochastic gradient descent (SGD) with momentum
    and fast persistent contrastive divergence (FPCD-n). A usage sketch is
    included at the bottom of this file.

    Documentation on RBM: http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf
    Documentation on FPCD: http://www.cs.toronto.edu/~tijmen/fpcd/fpcd.pdf

    Parameters:
        visible_size: The size of the visible layer
        hidden_size: The size of the hidden layer
        epochs: The number of training iterations over the dataset (default 10)
        batch_size: The size of each individual mini-batch (default 100)
        n: The number of Gibbs steps in the negative phase of training (default 1)
        learn_rate: The learning rate of SGD (default 0.1)
        momentum: The momentum of gradient updates in SGD (default 0.9)
        fw_decay: The decay rate of the fast weights in FPCD
                  (the default of 0.98 gives good empirical results)
        l2: The magnitude of L2 regularization (default 0.0)
        verbose: Display costs and runtime during training (default False)

    Set attributes:
        W: The weights of the RBM
        vbias: The visible bias term
        hbias: The hidden bias term
        fW: The fast weights used for FPCD
        total_epochs: The total number of epochs this RBM has been trained
        cost_hist: A list of cost histories of each mini-batch during training
    """
    def __init__(self, visible_size, hidden_size, epochs=10, batch_size=100,
                 n=1, learn_rate=0.1, momentum=0.9, fw_decay=0.98, l2=0.0,
                 verbose=False):
        self.visible_size = visible_size
        self.hidden_size = hidden_size
        self.hbias = np.zeros(hidden_size)
        self.vbias = np.zeros(visible_size)
        self.W = random.randn(hidden_size, visible_size) / \
                 np.sqrt(hidden_size + visible_size)
        self.batch_size = batch_size
        self.epochs = epochs
        self.learn_rate = learn_rate
        self.n = n
        self.l2 = l2
        self.momentum = momentum
        self.verbose = verbose
        self.fw_decay = fw_decay
        self.fW = np.zeros(self.W.shape)
        self.flr = learn_rate * np.exp(1) # fast learn rate heuristic
        # persistent hidden states of the negative-phase (fantasy) chains for FPCD
        self.p = np.zeros((self.batch_size, self.hidden_size))
        self._prevgrad = {'W': np.zeros(self.W.shape),
                          'hbias': np.zeros(hidden_size),
                          'vbias': np.zeros(visible_size)}
        self.total_epochs = 0
        self.cost_hist = []

    def __repr__(self):
        return ('SparseRBM(visible_size=%i, hidden_size=%i, epochs=%i, '
                'batch_size=%i, learn_rate=%.4g, momentum=%.4g, l2=%.4g, '
                'fw_decay=%.4g, verbose=%s)') %(self.visible_size, self.hidden_size,
                 self.epochs, self.batch_size, self.learn_rate, self.momentum,
                 self.l2, self.fw_decay, self.verbose)

    def propup(self, vis, subsample_ids=None, fw=False):
        """
        Compute and sample P(h | v)

        Note: Calling this function without subsample_ids can be used to perform
              a transformation of the full dataset to be used in an ML pipeline

        Args:
            vis: The state of the visible layer - shape (m, len(subsample_ids))
            subsample_ids: The feature indices used in this subsample
            fw: Boolean controlling if fast weights should be used

        Returns:
            a 3-tuple: (sample, probabilities, linear outputs before nonlinearity)
        """
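        # Subsampling trick: when subsample_ids is given, only those columns of W
        # are used, so the product below is (m, len(subsample_ids)) dotted with
        # (len(subsample_ids), hidden_size) instead of involving the full
        # visible_size. Features outside subsample_ids are treated as inactive.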
        W = self.fW + self.W if fw else self.W
        if subsample_ids is not None:
            W = W[:, subsample_ids]
        pre_non_lin = vis.dot(W.T) + self.hbias
        non_lin = sigmoid(pre_non_lin)
        sample = sample_bernoulli(non_lin)
        return (sample, non_lin, pre_non_lin)

    def propdown(self, hid, subsample_ids=None, fw=False):
        """
        Compute and sample P(v | h)

        Note: Calling this function without subsample_ids when visible_size is
              large will be extremely slow and may cause memory issues that
              massively slow down your computer. You've been warned!

        Args:
            hid: The state of the hidden layer - shape: (m, hidden_size)
            subsample_ids: The visible indices we want from this subsample
            fw: Boolean controlling if fast weights should be used

        Returns:
            a 3-tuple: (sample, probabilities, linear outputs before nonlinearity)
        """
        W = self.fW + self.W if fw else self.W
        vbias = self.vbias
        if subsample_ids is not None:
            W = W[:, subsample_ids]
            vbias = vbias[subsample_ids]
        pre_non_lin = hid.dot(W) + vbias
        non_lin = sigmoid(pre_non_lin)
        sample = sample_bernoulli(non_lin)
        return (sample, non_lin, pre_non_lin)

    def gibbs_hvh(self, h, meanfield=False, **args):
        """
        Performs one step of Gibbs sampling given the hidden state

        Args:
            h: The hidden state
            meanfield: Boolean controlling if we want to use the mean field values
                       during Gibbs sampling instead of samples
            **args: arguments to pass to propup/propdown procedures

        Returns:
            a 2-tuple of 3-tuples (visible samples, hidden samples)
        """
        v_samples = self.propdown(h, **args)
        v = v_samples[1] if meanfield else v_samples[0]
        h_samples = self.propup(v, **args)
        return v_samples, h_samples

    def gibbs_vhv(self, v, meanfield=False, **args):
        """
        Performs one step of Gibbs sampling given the visible state

        Args:
            v: The visible state
            meanfield: Boolean controlling if we want to use the mean field values
                       during Gibbs sampling instead of samples
            **args: arguments to pass to propup/propdown procedures

        Returns:
            a 2-tuple of 3-tuples (visible samples, hidden samples)
        """
        h_samples = self.propup(v, **args)
        # use the binary sample (index 0), not the pre-nonlinearity output,
        # mirroring gibbs_hvh above
        h = h_samples[1] if meanfield else h_samples[0]
        v_samples = self.propdown(h, **args)
        return v_samples, h_samples

    def cost(self, v, subsample_ids=None):
        """
        Compute the 'cost' and gradient using FPCD.

        NOTE: The 'cost' is not the objective being optimized; it is only the
              approximate reconstruction error of the visible sample, reported
              as a training diagnostic. RBM training actually follows the
              gradient of the log-likelihood defined by the model's energy
              function.

        Args:
            v: The visible state
            subsample_ids: The visible indices we want to consider when computing
                           the reconstruction error and gradient

        Returns:
            cost: The reconstruction error
            grad: A dict containing gradient approximations for W, vbias, hbias
        """
        num_points = v.shape[0]
        # positive phase
        pos_h_samples = self.propup(v, subsample_ids)
        # negative phase
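        # FPCD: the negative phase starts from the persistent hidden states in
        # self.p (fantasy particles carried over from the previous mini-batch)
        # rather than from the data, and Gibbs sampling uses W + fW (fw=True).
        # The rapidly decaying fast weights fW push the chains away from
        # recently visited modes, which speeds up mixing.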
        nh0 = self.p[:num_points]
        for i in xrange(self.n):
            neg_v_samples, neg_h_samples = self.gibbs_hvh(nh0,
                                                    subsample_ids=subsample_ids,
                                                    fw=True)
            nh0 = neg_h_samples[0]
        # compute gradients
        grad = self._grad(v, pos_h_samples, neg_v_samples, neg_h_samples)
        self.p[:num_points] = nh0
        # compute reconstruction error
        reconstruction = self.propdown(pos_h_samples[0], subsample_ids)[1]
        cost = np.abs(v - reconstruction).sum(1).mean(0)
        return cost, grad

    def _grad(self, pv0, pos_h, neg_v, neg_h):
        """
        Helper to compute the gradient approximation

        Args:
            pv0: visible layer state from the positive phase
            pos_h: hidden layer state from the positive phase
            neg_v: visible layer state from the negative phase
            neg_h: hidden layer state from the negative phase

        Returns:
            The gradient dict required in the cost function
        """
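        # Contrastive-divergence style gradient estimate:
        #   dW     ~  <h v>_data  -  <h v>_model
        #   dvbias ~  <v>_data    -  <v>_model
        #   dhbias ~  <h>_data    -  <h>_model
        # where the data expectations use p(h | v) on the mini-batch (pos_h) and
        # the model expectations use the mean-field values from the negative
        # phase (neg_v, neg_h). The E_* variables below hold these expectations.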
        grad = {}
        num_points = pv0.shape[0]
        E_v = neg_v[1]
        E_h = neg_h[1]
        E_hgv = pos_h[1]
        E_vh = E_h.T.dot(E_v)
        E_vhgv = E_hgv.T.dot(pv0)
        grad['W'] = (E_vhgv - E_vh) / num_points
        grad['vbias'] = (pv0 - E_v).mean(0)
        grad['hbias'] = (E_hgv - E_h).mean(0)
        return grad

    def update(self, grad, subsample_ids=None):
        """
        Update the RBM parameters W, vbias, hbias, fW using momentum

        Args:
            grad: The gradient dict returned from the cost function
            subsample_ids: The subsample indices used when generating the gradient

        Returns:
            self
        """
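        # Momentum SGD step, e.g. for the weights:
        #   dW = momentum * dW_prev + learn_rate * (grad_W - l2 * W)
        #   W  = W + dW
        # Gradients computed on a subsample are first scattered back into
        # full-size zero arrays so that only the active columns are updated.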
        prev_grad = self._prevgrad
        dW0 = grad['W']
        dv0 = grad['vbias']
        dh0 = grad['hbias']
        if subsample_ids is not None:
            dv0 = np.zeros(self.vbias.shape)
            dW0 = np.zeros(self.W.shape)
            dv0[subsample_ids] = grad['vbias']
            dW0[:, subsample_ids] = grad['W']
        dW = self.momentum * prev_grad['W'] + \
             self.learn_rate * (dW0 - self.l2 * self.W)
        dh = self.momentum * prev_grad['hbias'] + self.learn_rate * dh0
        dv = self.momentum * prev_grad['vbias'] + self.learn_rate * dv0
        self.W += dW
        self.hbias += dh
        self.vbias += dv
        # fast-weight update for FPCD: decay the fast weights toward zero and
        # add the current gradient scaled by the fast learning rate
        self.fW = self.fw_decay * self.fW + self.flr * dW0
        self._prevgrad['W'] = dW
        self._prevgrad['hbias'] = dh
        self._prevgrad['vbias'] = dv
        return self

    def transform(self, data):
        """
        Perform a transformation of the data to activation probabilities of the
        hidden layer

        Args:
            data: the data to be transformed, interpreted as the visible layer

        Returns:
            The activation probabilities p(h | v)
        """
        return self.propup(data)[1]

    def fit(self, data):
        """
        Trains the RBM using stochastic gradient descent for self.epochs
        iterations over the dataset

        Note: contrary to the Scikit-Learn convention, calling fit will not
              reinitialize the weights of the model. Training will continue
              from the weights and biases the RBM currently has

        Args:
            data - the data to be interpreted as the visible states of the RBM

        Returns:
            self
        """
        n, m = data.shape
        num_batches = n // self.batch_size
        e = 0

        if self.verbose:
            start_time = time.clock()

        while e < self.epochs:
            e += 1
            batches = make_batches(data, self.batch_size)
            for i, (batch, subsample_ids) in enumerate(batches):
                cost, grad = self.cost(batch, subsample_ids)
                self = self.update(grad, subsample_ids)
                self.cost_hist.append(cost)
                if self.verbose:
                    print 'Batch %i - Cost %0.6f\r'%(i+1, cost),
                    stdout.flush()
            self.total_epochs += 1
            if self.verbose:
                print 'Training Epoch %i'%(self.total_epochs),
                print 'Average Cost: %0.6f\t\t'%np.mean(self.cost_hist[-num_batches:])
                stdout.flush()

        if self.verbose:
            end_time = time.clock()
            print 'Runtime %0.2fs'%(end_time-start_time)

        return self

def make_batches(data, batch_size=100):
    """
    Split the data into minibatches of size batch_size

    This procedure generates subsample ids for each batch by only considering
    features that are active in the minibatch

    Args:
        data - the data to be split into minibatches (must be rank 2)
        batch_size - the size of the minibatches

    Returns:
        batches - a list: [(batch, subsample_ids) for batch in minibatches]
    """
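    # Example: for a minibatch whose only nonzero columns are 2, 5 and 9,
    # subsample_ids is array([2, 5, 9]) and the densified batch has shape
    # (batch_size, 3); the RBM then runs Gibbs sampling on those 3 features only.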
    n = data.shape[0]
    perm = random.permutation(range(n))
    i = 0
    batches = []
    while i < n:
        batch = perm[i:i+batch_size]
        i += batch_size
        batches.append(data[batch])
    # np.flatnonzero returns the indices of the active (nonzero) columns;
    # the except branch handles dense arrays, which have no .A attribute
    try:
        ids = [np.flatnonzero((b.sum(0) != 0).A.flatten()) for b in batches]
    except AttributeError:
        ids = [np.flatnonzero((b.sum(0) != 0).flatten()) for b in batches]
    batches = [(b[:,i].toarray() if sparse.issparse(b) else b[:,i], i)
               for b,i in zip(batches, ids)]
    return batches

def sigmoid(X):
    """Compute the elementwise logistic sigmoid function"""
    return 1 / (1 + np.exp(-X))

def sample_bernoulli(X):
    """
    All values of X must be probabilities of independent events, each occurring
    according to a Bernoulli distribution

    Returns an indicator array:
        output[i,j] = 1 iff X[i,j] >= uniform(0, 1)
    """
    return (X >= random.uniform(size=X.shape)).astype('b')
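
# ---------------------------------------------------------------------------
# Minimal usage sketch: trains a small SparseRBM on a randomly generated
# sparse binary matrix to illustrate the calling convention. The data, layer
# sizes and hyperparameters below are illustrative placeholders, not
# recommended settings.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # synthetic dataset: 500 examples over 1000 binary features, ~5% active
    X = sparse.rand(500, 1000, density=0.05, format='csr')
    X.data = np.ones_like(X.data)  # binarize the nonzero entries

    rbm = SparseRBM(visible_size=1000, hidden_size=50, epochs=2,
                    batch_size=100, learn_rate=0.1, verbose=True)
    rbm.fit(X)            # SGD with momentum + FPCD on sparse minibatches
    H = rbm.transform(X)  # hidden activation probabilities, shape (500, 50)
    print(H.shape)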