  1. """ Theano CRBM implementation.
  2.  
  3. For details, see:
  4. http://www.uoguelph.ca/~gwtaylor/publications/nips2006mhmublv
  5. Sample data:
  6. http://www.uoguelph.ca/~gwtaylor/publications/nips2006mhmublv/motion.mat
  7.  
  8. @author Graham Taylor"""
  9.  
  10. import numpy
  11. import numpy as np
  12. import matplotlib.pyplot as plt
  13. import time
  14.  
  15. import theano
  16. import theano.tensor as T
  17.  
  18. from theano.tensor.shared_randomstreams import RandomStreams
  19.  
  20. from motion import load_data
  21.  
  22.  
class CRBM(object):
    """Conditional Restricted Boltzmann Machine (CRBM)"""
    def __init__(self, input=None, input_history=None, n_visible=49,
                 n_hidden=500, delay=6, A=None, B=None, W=None, hbias=None,
                 vbias=None, numpy_rng=None,
                 theano_rng=None):
        """
        CRBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone CRBMs or symbolic variable if the
        CRBM is part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param delay: number of past visible frames to condition on

        :param A: None for standalone CRBMs or symbolic variable pointing to a
        shared autoregressive (past visibles to current visibles) weight
        matrix in case the CRBM is part of a CDBN network; in a CDBN, the
        weights are shared between CRBMs and layers of an MLP

        :param B: None for standalone CRBMs or symbolic variable pointing to a
        shared past-visibles-to-hidden weight matrix in case the CRBM is part
        of a CDBN network

        :param W: None for standalone CRBMs or symbolic variable pointing to a
        shared visible-to-hidden weight matrix in case the CRBM is part of a
        CDBN network

        :param hbias: None for standalone CRBMs or symbolic variable pointing
        to a shared hidden units bias vector in case the CRBM is part of a
        different network

        :param vbias: None for standalone CRBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.delay = delay

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        if W is None:
            # the output of randn is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU
            initial_W = np.asarray(0.01 * numpy_rng.randn(n_visible,
                                                          n_hidden),
                                   dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W')

        if A is None:
            initial_A = np.asarray(0.01 * numpy_rng.randn(n_visible * delay,
                                                          n_visible),
                                   dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            A = theano.shared(value=initial_A, name='A')

        if B is None:
            initial_B = np.asarray(0.01 * numpy_rng.randn(n_visible * delay,
                                                          n_hidden),
                                   dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            B = theano.shared(value=initial_B, name='B')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')

        # initialize input layer for standalone CRBM or layer0 of CDBN
        self.input = input
        if not input:
            self.input = T.matrix('input')

        self.input_history = input_history
        if not input_history:
            self.input_history = T.matrix('input_history')

        self.W = W
        self.A = A
        self.B = B
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.A, self.B, self.hbias, self.vbias]

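    # Parameter shapes, as initialized above:
    #   W: (n_visible, n_hidden)           current visibles -> hiddens
    #   A: (n_visible * delay, n_visible)  autoregressive, history -> visibles
    #   B: (n_visible * delay, n_hidden)   history -> hiddens
    # A and B act on the flattened delay-step history of past visible frames.
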
    def free_energy(self, v_sample, v_history):
        ''' Function to compute the free energy of a sample conditional
        on the history '''
        wx_b = T.dot(v_sample, self.W) + T.dot(v_history, self.B) + self.hbias
        ax_b = T.dot(v_history, self.A) + self.vbias
        visible_term = T.sum(0.5 * T.sqr(v_sample - ax_b), axis=1)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)

        return visible_term - hidden_term

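    # The free energy computed above, for real-valued (unit-variance) visibles
    # conditioned on the concatenated history v_<, is
    #   F(v | v_<) = 0.5 * || v - (v_< A + vbias) ||^2
    #                - sum_j log(1 + exp((v W)_j + (v_< B)_j + hbias_j)),
    # i.e. a quadratic term around the dynamically biased visible mean minus
    # the usual softplus term over hidden units.
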
    def propup(self, vis, v_history):
        ''' This function propagates the visible units activation upwards to
        the hidden units

        Note that we also return the pre-sigmoid activation of the layer. In
        the standard RBM tutorial code this is needed to build a numerically
        stable reconstruction cost; it is kept here for compatibility even
        though the squared-error reconstruction cost below does not use it.
        '''
        pre_sigmoid_activation = T.dot(vis, self.W) + \
                                 T.dot(v_history, self.B) + self.hbias
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    def sample_h_given_v(self, v0_sample, v_history):
        ''' This function infers state of hidden units given visible units '''
        # compute the activation of the hidden units given a sample of the
        # visibles
        pre_sigmoid_h1, h1_mean = self.propup(v0_sample, v_history)
        # get a sample of the hiddens given their activation
        # Note that theano_rng.binomial returns a symbolic sample of dtype
        # int64 by default. If we want to keep our computations in floatX
        # for the GPU we need to specify to return the dtype floatX
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1,
                                             p=h1_mean,
                                             dtype=theano.config.floatX)
        return [pre_sigmoid_h1, h1_mean, h1_sample]

    def propdown(self, hid, v_history):
        '''This function propagates the hidden units activation downwards to
        the visible units

        The visible units are real-valued (linear-Gaussian), so there is no
        sigmoid here; we return the mean activation directly.
        '''
        mean_activation = T.dot(hid, self.W.T) + T.dot(v_history, self.A) + \
                          self.vbias
        return mean_activation

    def sample_v_given_h(self, h0_sample, v_history):
        ''' This function infers state of visible units given hidden units '''
        # compute the activation of the visibles given the hidden sample
        v1_mean = self.propdown(h0_sample, v_history)
        # the visibles are real-valued, so instead of sampling around the
        # mean we simply use the mean-field value
        v1_sample = v1_mean  # mean-field
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample, v_history):
        ''' This function implements one step of Gibbs sampling,
        starting from the hidden state'''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample, v_history)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample,
                                                                   v_history)

        return [v1_mean, v1_sample, pre_sigmoid_h1, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample, v_history):
        ''' This function implements one step of Gibbs sampling,
        starting from the visible state'''
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample,
                                                                   v_history)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample, v_history)

        return [pre_sigmoid_h1, h1_mean, h1_sample, v1_mean, v1_sample]

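    # gibbs_hvh is the step function scanned over during CD training (see
    # get_cost_updates), while gibbs_vhv is scanned over at generation time
    # (see generate); the history is passed as a non-sequence in both cases.
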
    def get_cost_updates(self, lr=0.1, k=1):
        """
        This function implements one step of CD-k

        :param lr: learning rate used to train the CRBM

        :param k: number of Gibbs steps to do in CD-k

        Returns a proxy for the cost and the updates dictionary. The
        dictionary contains the update rules for weights and biases, as well
        as the updates of the random number generator produced by scan.
        """

        # compute positive phase
        pre_sigmoid_ph, ph_mean, ph_sample = \
            self.sample_h_given_v(self.input, self.input_history)

        # for CD, we use the newly generated hidden sample
        chain_start = ph_sample

        # perform actual negative phase
        # in order to implement CD-k we need to scan over the
        # function that implements one gibbs step k times.
        # Read the Theano tutorial on scan for more information:
        # http://deeplearning.net/software/theano/library/scan.html
        # the scan will return the entire Gibbs chain
        # the updates dictionary is important because it contains the updates
        # for the random number generator
        [nv_means, nv_samples, pre_sigmoid_nhs, nh_means,
         nh_samples], updates = theano.scan(self.gibbs_hvh,
                # the None are placeholders, saying that
                # chain_start is the initial state corresponding to the
                # 5th output
                outputs_info=[None, None, None, None, chain_start],
                non_sequences=self.input_history,
                n_steps=k)

        # determine gradients on CRBM parameters
        # note that we only need the sample at the end of the chain
        chain_end = nv_samples[-1]

        cost = T.mean(self.free_energy(self.input, self.input_history)) - \
            T.mean(self.free_energy(chain_end, self.input_history))
        # We must not compute the gradient through the gibbs sampling
        gparams = T.grad(cost, self.params, consider_constant=[chain_end])

        # constructs the update dictionary
        for gparam, param in zip(gparams, self.params):
            # make sure that the learning rate is of the right dtype
            if param == self.A:
                # slow down autoregressive updates
                updates[param] = param - gparam * 0.01 * \
                    T.cast(lr, dtype=theano.config.floatX)
            else:
                updates[param] = param - gparam * \
                    T.cast(lr, dtype=theano.config.floatX)

        # reconstruction error is a better proxy for monitoring CD training
        monitoring_cost = self.get_reconstruction_cost(updates, nv_means[-1])

        return monitoring_cost, updates

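    # The CD-k parameter update implemented above is
    #   theta <- theta - lr * d/d_theta [ F(v_data | history) - F(v_model | history) ],
    # where v_model is the sample at the end of the k-step Gibbs chain and the
    # gradient is not propagated through the chain (consider_constant). The
    # autoregressive weights A receive an effective learning rate of 0.01 * lr.
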
    def get_reconstruction_cost(self, updates, pre_sigmoid_nv):
        """Approximation to the reconstruction error
        """
        # sum squared error over dimensions, mean over cases
        recon = T.mean(T.sum(T.sqr(self.input - pre_sigmoid_nv), axis=1))

        return recon

    def generate(self, orig_data, orig_history, n_samples, n_gibbs=30):
        """ Given initialization(s) of visibles and matching history, generate
        n_samples into the future.

        orig_data : n_seq by n_visibles array
            initialization for first frame
        orig_history : n_seq by delay * n_visibles array
            delay-step history
        n_samples : int
            number of samples to generate forward
        n_gibbs : int
            number of alternating Gibbs steps per iteration"""
        n_seq = orig_data.shape[0]
        persistent_vis_chain = theano.shared(orig_data)
        persistent_history = theano.shared(orig_history)

        [presig_hids, hid_mfs, hid_samples, vis_mfs, vis_samples], updates = \
            theano.scan(self.gibbs_vhv,
                        outputs_info=[None, None, None, None,
                                      persistent_vis_chain],
                        non_sequences=persistent_history,
                        n_steps=n_gibbs)

        # add to updates the shared variables that take care of our persistent
        # chain:
        # initialize the next visible with the current visible and shift the
        # history one step forward (the newly generated frame becomes the most
        # recent entry, the oldest one is dropped)
        updates.update({persistent_vis_chain: vis_samples[-1],
                        persistent_history: T.concatenate(
                            (vis_samples[-1],
                             persistent_history[:, :(self.delay - 1) *
                                                self.n_visible],
                             ), axis=1)})
        # construct the function that implements our persistent chain.
        # we generate the "mean field" activations for plotting and the actual
        # samples for reinitializing the state of our persistent chain
        sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]],
                                    updates=updates,
                                    name='sample_fn')

        generated_series = np.empty((n_seq, n_samples, self.n_visible))
        for t in xrange(n_samples):
            print "Generating frame %d" % t
            vis_mf, vis_sample = sample_fn()
            generated_series[:, t, :] = vis_mf
        return generated_series


def train_crbm(learning_rate=1e-3, training_epochs=300,
               dataset='../data/motion.mat', batch_size=100,
               n_hidden=100, delay=6):
    """
    Demonstrate how to train a CRBM.
    This is demonstrated on mocap data.

    :param learning_rate: learning rate used for training the CRBM

    :param training_epochs: number of epochs used for training

    :param dataset: path to the dataset (matlab format)

    :param batch_size: size of a batch used to train the CRBM

    """

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    # batchdata is returned as a theano shared variable of dtype floatX
    batchdata, seqlen, data_mean, data_std = load_data(dataset)

    n_dim = batchdata.get_value(borrow=True).shape[1]

    # valid starting indices: the first `delay` frames of each sequence
    # have no complete history, so they cannot start a training case
    batchdataindex = []
    last = 0
    for s in seqlen:
        batchdataindex += range(last + delay, last + s)
        last += s

    permindex = np.array(batchdataindex)
    rng.shuffle(permindex)

    # compute number of minibatches for training; count only frames with a
    # full history so the last minibatch slice of permindex is never short
    n_train_batches = len(batchdataindex) / batch_size

    # allocate symbolic variables for the data
    index = T.lvector()       # index to a [mini]batch
    index_hist = T.lvector()  # index to history
    x = T.matrix('x')         # the data
    x_history = T.matrix('x_history')

    #theano.config.compute_test_value='warn'
    #x.tag.test_value = np.random.randn(batch_size, n_dim)
    #x_history.tag.test_value = np.random.randn(batch_size, n_dim*delay)

    # construct the CRBM class
    crbm = CRBM(input=x, input_history=x_history, n_visible=n_dim,
                n_hidden=n_hidden, delay=delay, numpy_rng=rng,
                theano_rng=theano_rng)

    # get the cost and the gradient corresponding to one step of CD-1
    cost, updates = crbm.get_cost_updates(lr=learning_rate, k=1)

    #################################
    #     Training the CRBM        #
    #################################

    # the purpose of train_crbm is solely to update the CRBM parameters
    train_crbm = theano.function([index, index_hist], cost,
                                 updates=updates,
                                 givens={x: batchdata[index],
                                         x_history: batchdata[index_hist].reshape(
                                             (batch_size, delay * n_dim))},
                                 name='train_crbm')

    plotting_time = 0.
    start_time = time.clock()

    # go through training epochs
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):

            # indexing is slightly complicated:
            # build a linear index to the starting frames for this batch
            # (i.e. time t); gives a batch_size length array for data
            data_idx = permindex[batch_index * batch_size:(batch_index + 1)
                                 * batch_size]

            # now build a linear index to the frames at each delay tap
            # (i.e. time t-1 to t-delay, most recent tap first);
            # gives a batch_size x delay array of indices for history
            hist_idx = np.array([data_idx - n for n in xrange(1, delay + 1)]).T

            this_cost = train_crbm(data_idx, hist_idx.ravel())
            mean_cost += [this_cost]

        print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)

    end_time = time.clock()

    pretraining_time = (end_time - start_time)

    print ('Training took %f minutes' % (pretraining_time / 60.))

    return crbm, batchdata

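# A minimal usage sketch (assumption: '../data/motion.mat' has been downloaded
# from the URL in the module docstring; the hyperparameter values below are
# illustrative only, not the author's recommended settings):
#
#     crbm, batchdata = train_crbm(learning_rate=1e-3, training_epochs=100,
#                                  dataset='../data/motion.mat',
#                                  batch_size=100, n_hidden=150, delay=6)

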
if __name__ == '__main__':
    crbm, batchdata = train_crbm()

    # Generate some sequences (in parallel) from the CRBM,
    # using training data as initialization

    # pick some starting points for each sequence
    data_idx = np.array([100, 200, 400, 600])
    orig_data = numpy.asarray(batchdata.get_value(borrow=True)[data_idx],
                              dtype=theano.config.floatX)

    hist_idx = np.array([data_idx - n for n in xrange(1, crbm.delay + 1)]).T
    hist_idx = hist_idx.ravel()

    orig_history = numpy.asarray(
        batchdata.get_value(borrow=True)[hist_idx].reshape(
            (len(data_idx), crbm.delay * crbm.n_visible)),
        dtype=theano.config.floatX)

    generated_series = crbm.generate(orig_data, orig_history, n_samples=100,
                                     n_gibbs=30)
    # prepend the initialization (the history is stored newest-first, so it is
    # reversed to get chronological order)
    generated_series = np.concatenate((orig_history.reshape(len(data_idx),
                                                            crbm.delay,
                                                            crbm.n_visible
                                                            )[:, ::-1, :],
                                       generated_series), axis=1)

    bd = batchdata.get_value(borrow=True)

    # plot dimension 1 of each generated sequence against the ground truth
    for i in xrange(len(generated_series)):
        # original
        start = data_idx[i]
        plt.subplot(len(generated_series), 1, i + 1)
        plt.plot(bd[start - crbm.delay:start + 100 - crbm.delay, 1],
                 label='true', linestyle=':')
        plt.plot(generated_series[i, :100, 1], label='predicted',
                 linestyle='-')

        leg = plt.legend()
        ltext = leg.get_texts()  # all the text.Text instances in the legend
        plt.setp(ltext, fontsize=9)
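
    # Assumption (not in the original paste): display the figure when the
    # script is run non-interactively; replace with plt.savefig(...) if a
    # file is preferred.
    plt.show()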