rbm_gaussian_2layer.yaml

a guest
Mar 24th, 2014
316
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
YAML 6.48 KB | None | 0 0
# This file shows how to train a Gaussian RBM variant on MNIST by viewing it as a 2-layer DBM.
# It is a variation of rbm.yaml with two changes: the input layer is a GaussianVisLayer, so the
# inputs are not binarized, and there are two binary hidden layers instead of one.
# The hyperparameters in this file aren't especially great; they're mostly chosen to demonstrate
# the interface. Feel free to suggest better hyperparameters!
!obj:pylearn2.train.Train {
    # For this example, we will modify the binarized rbm.yaml to use the
    # raw inputs with a Gaussian Visible Layer and two Binary
    # Vector hidden layers.
    dataset: &data !obj:pylearn2.datasets.mnist.MNIST {
        which_set: 'train',
        one_hot: 1,
        start: 0,
        stop: %(train_stop)i
    },
    model: !obj:pylearn2.models.dbm.DBM {
        batch_size: 100,
        # niter is the number of mean field iterations used in the positive phase.
        # In the single-hidden-layer rbm.yaml, one iteration reaches convergence;
        # with two hidden layers it is only an approximation, kept at 1 here for simplicity.
        niter: 1,
        # The visible layer of this RBM is a GaussianVisLayer
        # This layer was recently repaired in early March 2014
        # to be working again for DBM models
        visible_layer: !obj:pylearn2.models.dbm.GaussianVisLayer {
            nvis: 784,
        },
        hidden_layers: [
            # This model has two hidden layers, each consisting of a binary vector.
            # Optionally, one can do max pooling on top of this vector, but
            # here we don't, by setting pool_size = 1.
            !obj:pylearn2.models.dbm.BinaryVectorMaxPool {
                # Every layer in the DBM must have a layer_name field.
                # These are used to generate unique names of monitoring
                # channels associated with the different layers.
                layer_name: 'h1',
                # The detector layer is the portion of this layer that
                # precedes the pooling. We control its size with this
                # argument.
                detector_layer_dim: %(detector_layer1_dim)i,
                pool_size: 1,
                # We initialize the weights by drawing them from W_ij ~ U(-irange, irange)
                irange: .05,
                # We initialize all the biases of the hidden units to a negative
                # number. This helps to learn a sparse representation.
                init_bias: -2.,
            },
            # This is the second hidden layer, identical to the first except
            # for the number of units and the layer name.
            !obj:pylearn2.models.dbm.BinaryVectorMaxPool {
                layer_name: 'h2',
                detector_layer_dim: %(detector_layer2_dim)i,
                pool_size: 1,
                irange: .05,
                init_bias: -2.,
            }
        ]
    },
    # We train the model using stochastic gradient descent.
    # One benefit of using pylearn2 is that we can use the exact same piece of
    # code to train a DBM as to train an MLP. The interface that SGD uses to get
    # the gradient of the cost function from an MLP can also get the *approximate*
    # gradient from a DBM.
    algorithm: !obj:pylearn2.training_algorithms.sgd.SGD {
        # We initialize the learning rate and momentum here. Down below
        # we can control the way they decay with various callbacks.
        learning_rate: 1e-3,
        # Compute new model parameters using SGD + Momentum
        learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum {
            init_momentum: 0.5,
        },
        # These arguments say to compute the monitoring channels on 10 batches
        # of the training set.
        monitoring_batches: %(monitoring_batches)i,
        monitoring_dataset: *data,
        # The SumOfCosts allows us to add together a few terms to make a complicated
        # cost function.
        cost: !obj:pylearn2.costs.cost.SumOfCosts {
            costs: [
                # The first term of our cost function is variational PCD.
                # For the RBM, the variational approximation is exact, so
                # this is really just PCD. In deeper models, it means we
                # use mean field rather than Gibbs sampling in the positive phase.
                !obj:pylearn2.costs.dbm.VariationalPCD {
                    # Here we specify how many fantasy particles to maintain
                    num_chains: 100,
                    # Here we specify how many steps of Gibbs sampling to do between
                    # each parameter update.
                    num_gibbs_steps: 5
                },
                # The second term of our cost function is a little bit of weight
                # decay.
                # Note since we have 2 layers, we need 2 entries here.
                !obj:pylearn2.costs.dbm.WeightDecay {
                    coeffs: [ .0001, .0001 ]
                },
                # Finally, we regularize the RBM to be sparse, using a method copied
                # from Ruslan Salakhutdinov's DBM demo.
                # Note since we have 2 layers, we need 2 entries here.
                !obj:pylearn2.costs.dbm.TorontoSparsity {
                    targets: [ .2, .2 ],
                    coeffs: [ .001, .001 ],
                }
            ],
        },
        # We tell the RBM to train for max_epochs epochs
        termination_criterion: !obj:pylearn2.termination_criteria.EpochCounter { max_epochs: %(max_epochs)i },
        update_callbacks: [
            # This callback makes the learning rate shrink by dividing it by decay_factor after
            # each sgd step.
            !obj:pylearn2.training_algorithms.sgd.ExponentialDecay {
                decay_factor: 1.000015,
                min_lr: 0.0001
            }
        ]
    },
    extensions: [
        # This callback makes the momentum grow to 0.9 linearly. It starts
        # growing at epoch 5 and finishes growing at epoch 6.
        !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor {
            final_momentum: .9,
            start: 5,
            saturate: 6
        },
    ],
    save_path: "%(save_path)s/dbm_gaussian_2layer.pkl",
    # This says to save it every epoch
    save_freq: 1
}
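
The %(...)i and %(...)s placeholders above are filled in by Python string substitution before
the YAML is parsed. A minimal driver script is sketched below; it assumes the config is saved
as rbm_gaussian_2layer.yaml, and the hyperparameter values are illustrative rather than the
author's choices.

# Sketch of a driver script for the config above (hyperparameter values are illustrative).
from pylearn2.config import yaml_parse

with open('rbm_gaussian_2layer.yaml', 'r') as f:
    yaml_string = f.read()

hyper_params = {
    'train_stop': 50000,         # number of MNIST training examples to use
    'detector_layer1_dim': 500,  # size of the first hidden layer
    'detector_layer2_dim': 500,  # size of the second hidden layer
    'monitoring_batches': 10,    # batches used to compute monitoring channels
    'max_epochs': 30,            # epochs before the EpochCounter stops training
    'save_path': '.',            # directory where dbm_gaussian_2layer.pkl is written
}

# Substitute the hyperparameters, build the Train object, and run the main loop.
train = yaml_parse.load(yaml_string % hyper_params)
train.main_loop()

After training finishes, the saved dbm_gaussian_2layer.pkl can be reloaded with
pylearn2.utils.serial.load for inspection of the learned weights and monitoring channels.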