# This file shows how to train a binary RBM by viewing it as a single layer DBM.
# Rather than raw MNIST pixels, the training data here is hidden-layer samples from
# a previously trained RBM ("rbm.pkl"), as in greedy layer-wise training of a DBN.
# The hyperparameters in this file aren't especially great; they're mostly chosen
# to demonstrate the interface. Feel free to suggest better hyperparameters!
!obj:pylearn2.train.Train {
    dataset: &data !obj:pylearn2.datasets.transformer_dataset.TransformerDataset {
        raw: &raw_data !obj:pylearn2.datasets.binarizer.Binarizer {
            raw: &raw_train !obj:pylearn2.datasets.mnist.MNIST {
                which_set: "train",
            }
        },
        transformer: !obj:pylearn2.models.DBNSampler.DBNSampler {
            rbm_list: [!pkl: "rbm.pkl"]
        }
    },
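    # A rough sketch of what the dataset above yields (not the literal pylearn2
    # implementation): each binarized MNIST image v is pushed through the RBM
    # stored in "rbm.pkl", and a sample of that RBM's hidden layer,
    #     h_j ~ Bernoulli( sigmoid( c_j + v^T W[:, j] ) ),
    # becomes one 500-dimensional training example for the model defined below.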
    model: !obj:pylearn2.models.dbm.DBM {
        batch_size: 100,
        # 1 mean field iteration reaches convergence in the RBM
        niter: 1,
        # The visible layer of this RBM is just a binary vector
        # (as opposed to a binary image for convolutional models,
        # a Gaussian distributed vector, etc.)
        visible_layer: !obj:pylearn2.models.dbm.BinaryVector {
            nvis: 500,
            # We can initialize the biases of the visible units
            # so that sigmoid(b_i) = E[v_i] where the expectation
            # is taken over the dataset. This should get the biases
            # about correct from the start and helps speed up learning.
            bias_from_marginals: *raw_train,
        },
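        # Concretely, the bias initialization above sets
        #     b_i = log( E[v_i] / (1 - E[v_i]) ),
        # i.e. the inverse sigmoid (logit) of each visible unit's mean value
        # over the dataset.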
        hidden_layers: [
            # This RBM has one hidden layer, consisting of a binary vector.
            # Optionally, one can do max pooling on top of this vector, but
            # here we don't, by setting pool_size = 1.
            !obj:pylearn2.models.dbm.BinaryVectorMaxPool {
                # Every layer in the DBM must have a layer_name field.
                # These are used to generate unique names of monitoring
                # channels associated with the different layers.
                layer_name: 'h',
                # The detector layer is the portion of this layer that
                # precedes the pooling. We control its size with this
                # argument. Here we request 500 hidden units.
                detector_layer_dim: 500,
                pool_size: 1,
                # We initialize the weights by drawing them from W_ij ~ U(-irange, irange)
                irange: .05,
                # We initialize all the biases of the hidden units to a negative
                # number. This helps to learn a sparse representation.
                init_bias: -2.,
            }
        ]
    },
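    # For reference, the model defined above is a standard binary RBM with energy
    #     E(v, h) = - b^T v - c^T h - v^T W h,
    # so the conditionals used for sampling and mean field are
    #     p(h_j = 1 | v) = sigmoid( c_j + v^T W[:, j] )
    #     p(v_i = 1 | h) = sigmoid( b_i + W[i, :] h )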
    # We train the model using stochastic gradient descent.
    # One benefit of using pylearn2 is that we can use the exact same piece of
    # code to train a DBM as to train an MLP. The interface that SGD uses to get
    # the gradient of the cost function from an MLP can also get the *approximate*
    # gradient from a DBM.
    algorithm: !obj:pylearn2.training_algorithms.sgd.SGD {
        # We initialize the learning rate and momentum here. Down below
        # we can control the way they decay with various callbacks.
        learning_rate: 1e-3,
        init_momentum: .5,
        # These arguments say to compute the monitoring channels on 10 batches
        # of the training set.
        monitoring_batches: 10,
        monitoring_dataset: *data,
        # The SumOfCosts allows us to add together a few terms to make a complicated
        # cost function.
        cost: !obj:pylearn2.costs.cost.SumOfCosts {
            costs: [
                # The first term of our cost function is variational PCD.
                # For the RBM, the variational approximation is exact, so
                # this is really just PCD. In deeper models, it means we
                # use mean field rather than Gibbs sampling in the positive phase.
                !obj:pylearn2.costs.dbm.VariationalPCD {
                    # Here we specify how many fantasy particles to maintain
                    num_chains: 100,
                    # Here we specify how many steps of Gibbs sampling to do between
                    # each parameter update.
                    num_gibbs_steps: 5
                },
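                # As a reminder of what PCD approximates (this is background, not
                # pylearn2 internals): the log-likelihood gradient for a weight is
                #     d log p(v) / d W_ij = E_data[ v_i h_j ] - E_model[ v_i h_j ],
                # where the model expectation is estimated from the 100 persistent
                # Gibbs chains above, each advanced 5 steps per parameter update.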
                # The second term of our cost function is a little bit of weight
                # decay.
                !obj:pylearn2.costs.dbm.WeightDecay {
                    coeffs: [ .0001 ]
                },
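                # Roughly speaking, this adds a penalty of the form
                #     coeff * sum_ij W_ij^2
                # per weight matrix; `coeffs` holds one coefficient per hidden
                # layer (just one here).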
                # Finally, we regularize the RBM to be sparse, using a method copied
                # from Ruslan Salakhutdinov's DBM demo.
                !obj:pylearn2.costs.dbm.TorontoSparsity {
                    targets: [ .2 ],
                    coeffs: [ .001 ],
                }
            ],
        },
        # We stop training after a single epoch. That is only enough for a quick
        # demonstration; raise max_epochs for a real training run.
        termination_criterion: !obj:pylearn2.termination_criteria.EpochCounter { max_epochs: 1 },
        update_callbacks: [
            # This callback makes the learning rate shrink by dividing it by decay_factor after
            # each sgd step.
            !obj:pylearn2.training_algorithms.sgd.ExponentialDecay {
                decay_factor: 1.000015,
                min_lr: 0.0001
            }
        ]
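        # In other words, after t SGD updates the learning rate is approximately
        #     learning_rate / decay_factor^t,
        # and it is never allowed to fall below min_lr (1e-4 here).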
    },
    extensions: [
        # This extension makes the momentum grow to 0.9 linearly. It starts
        # growing at epoch 5 and finishes growing at epoch 6.
        !obj:pylearn2.training_algorithms.sgd.MomentumAdjustor {
            final_momentum: .9,
            start: 5,
            saturate: 6
        },
    ],
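    # Note that with max_epochs set to 1 above, this schedule never actually
    # engages and momentum stays at its initial value of .5; it only matters
    # once max_epochs is raised past `start`.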
    # This says to save the trained model to the same path as
    # the yaml file, but replacing .yaml with .pkl
    save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl",
    # This says to save it every epoch
    save_freq: 1
}
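# A minimal sketch of how a file like this is usually run (assuming pylearn2 is
# installed; "dbn_layer2.yaml" below is just a placeholder for whatever name this
# file is saved under):
#
#     python pylearn2/scripts/train.py dbn_layer2.yaml
#
# The train script fills in ${PYLEARN2_TRAIN_FILE_FULL_STEM} with the path of the
# yaml file minus its extension. Alternatively, from Python:
#
#     from pylearn2.config import yaml_parse
#     train = yaml_parse.load(open("dbn_layer2.yaml").read())
#     train.main_loop()
#
# though in that case you would need to set PYLEARN2_TRAIN_FILE_FULL_STEM in the
# environment yourself (or hard-code save_path above).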