Untitled | a guest | May 5th, 2016
  1. """
  2. Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
  3. BSD License
  4. """
  5. import numpy as np
  6.  
  7. # data I/O
  8. data = open('input.txt', 'r').read() # should be simple plain text file
  9. chars = list(set(data))
  10. data_size, vocab_size = len(data), len(chars)
  11. print 'data has %d characters, %d unique.' % (data_size, vocab_size)
  12. char_to_ix = { ch:i for i,ch in enumerate(chars) }
  13. ix_to_char = { i:ch for i,ch in enumerate(chars) }
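# e.g. if data were the string 'hello', chars could come out as ['h','e','l','o']
# (set order is arbitrary), giving char_to_ix = {'h':0,'e':1,'l':2,'o':3} and
# ix_to_char as the inverse lookup from index back to character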

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
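# shapes: Wxh is (hidden_size, vocab_size), Whh is (hidden_size, hidden_size),
# Why is (vocab_size, hidden_size); the biases are column vectors. The 0.01 scale
# keeps the initial tanh activations small so the hidden units start out unsaturated.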

def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in xrange(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(xrange(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
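
# --- not part of the original paste: a minimal numerical gradient check (sketch) ---
# It perturbs a few random entries of each parameter and compares the centered
# difference (loss(p+delta) - loss(p-delta)) / (2*delta) against the analytic
# gradient returned by lossFun. Call it by hand on a short inputs/targets pair
# before training to sanity-check the backward pass. Note that lossFun clips
# its gradients at +/-5, so very large gradients would not match exactly.
def grad_check(inputs, targets, hprev, num_checks=5, delta=1e-5):
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
  for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                 [dWxh, dWhh, dWhy, dbh, dby],
                                 ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    for i in xrange(num_checks):
      ri = int(np.random.randint(param.size)) # pick a random entry of this parameter
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      loss_plus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val - delta
      loss_minus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val # restore the original value
      grad_numerical = (loss_plus - loss_minus) / (2 * delta)
      grad_analytic = dparam.flat[ri]
      rel_error = abs(grad_analytic - grad_numerical) / max(abs(grad_analytic) + abs(grad_numerical), 1e-12)
      print '%s: numerical %f, analytic %f, relative error %e' % (name, grad_numerical, grad_analytic, rel_error)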

def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in xrange(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
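# note: before any training the model predicts roughly uniformly over vocab_size
# characters, so the expected cross-entropy is -log(1/vocab_size) per character;
# summed over a seq_length chunk, that is the starting value of smooth_loss above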
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print '----\n %s \n----' % (txt, )

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter
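Note on running the paste: the listing is the minimal character-level RNN that the docstring credits to Andrej Karpathy, and it is written for Python 2 (print statements, xrange). To try it, save it as a .py file next to a plain-text input.txt, run it with a Python 2 interpreter that has NumPy installed, and it will print a 200-character sample and the smoothed loss every 100 iterations, sweeping over the data indefinitely until interrupted.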