import tensorflow as tf
from six.moves.urllib.request import urlretrieve
import os
import tarfile
import sys
import random
import math
import collections
import numpy as np

url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
filename = 'simple-examples.tgz'


def get_ptb_dataset(filename):
    if not os.path.exists(filename):
        print('Attempting to download')
        filename, _ = urlretrieve(url, filename)
        print('Download Complete')
    statinfo = os.stat(filename)
    return filename

filename = get_ptb_dataset(filename)
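
# The archive extracts to a simple-examples/ directory; the Penn Treebank
# text files used below live under simple-examples/data/
# (ptb.train.txt, ptb.valid.txt, ptb.test.txt).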

def extract(filename):
    root = os.path.splitext(filename)[0]
    if os.path.isdir(root):
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
        return root
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall()
        tar.close()
        return root

filename = extract(filename)
filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
train_filename = os.path.join(filename, 'data/ptb.train.txt')
valid_filename = os.path.join(filename, 'data/ptb.valid.txt')
test_filename = os.path.join(filename, 'data/ptb.test.txt')

print(train_filename)
print(valid_filename)
print(test_filename)
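
# PTB preprocessing convention: every newline in the raw text becomes an
# explicit <eos> token, and the files already use a fixed vocabulary of
# roughly 10,000 distinct words, so a vocabulary built from the training set
# also covers the validation and test sets.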

def read_words(filename):
    with tf.gfile.GFile(filename, "r") as f:
        return f.read().decode('utf-8').replace("\n", "<eos>").split()

def build_vocab(filename):
    data = read_words(filename)
    print('Number of words:', len(data))
    counter = collections.Counter(data)
    # Sort by descending frequency, breaking ties alphabetically, so ids are
    # assigned from the most frequent word downwards.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    #print(count_pairs)  # dumps the entire sorted vocabulary; very verbose
    words, _ = list(zip(*count_pairs))
    words_to_id = dict(zip(words, range(len(words))))
    return words_to_id

def file_to_word_ids(filename, word_to_id):
    data = read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

word_to_id = build_vocab(train_filename)
train_dataset = file_to_word_ids(train_filename, word_to_id)
valid_dataset = file_to_word_ids(valid_filename, word_to_id)
test_dataset = file_to_word_ids(test_filename, word_to_id)
vocabulary_size = len(word_to_id)
print('Vocabulary Size', vocabulary_size)

def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10   # guard against log(0)
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

batch_size = 64
data_index = 0
unrollings = 5
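
# Batch generation: generate_batches returns one step of training data per
# call - a list containing a single "unrolling" of batch_size word ids as
# inputs and the corresponding next-word one-hot vectors as labels. A global
# data_index cursor is advanced so successive calls walk through the corpus.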
def generate_batches(raw_data, batch_size, unrollings):
    global data_index
    data_len = len(raw_data)
    num_batches = data_len // batch_size
    inputs = []
    labels = []
    # Only a single unrolling is generated; the loop over `unrollings` is
    # left commented out.
    #for j in range(unrollings):
    inputs.append([])
    labels.append([])
    for i in range(batch_size):
        # Index modulo data_len so the label index can never run past the
        # last word of the corpus.
        inputs[0].append(raw_data[(data_index + i) % data_len])
        labels[0].append(one_hot(raw_data[(data_index + i + 1) % data_len]))
    # Advance the cursor by a full batch so successive calls return
    # consecutive windows of the corpus.
    data_index = (data_index + batch_size) % data_len
    return inputs, labels

def one_hot(x):
    # One-hot encode a word id over the full vocabulary.
    rep = np.zeros((vocabulary_size))
    rep[x] = 1.0
    return rep

# Quick smoke test of the batch generator; the graph below defines its own
# placeholders and pulls fresh batches inside the training loop.
train_input, train_labels = generate_batches(train_dataset, batch_size, unrollings=5)
#train_input = tf.placeholder(shape=[batch_size, vocabulary_size], dtype=tf.float32)
#train_labels = tf.placeholder(shape=[batch_size, vocabulary_size], dtype=tf.float32)

'''def model_lstm():
    num_hidden = 32
    lstm = tf.nn.rnn.cell.LSTMCell(num_hidden, forget_bias=1.0)
    state = tf.zeros([batch_size, lstm.state_size])
    probabilities = []
    loss = 0.0
    for
'''

def id_to_word(i):
    for word in word_to_id:
        if i == word_to_id[word]:
            #print(word_to_id[word])
            return word
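
# Sampling helpers: sample_distribution draws an index from a normalized
# probability vector by walking its cumulative sum, sample turns a prediction
# row into a one-hot vector, random_distribution provides a random starting
# distribution for text generation, and one_hot_to_id inverts a one-hot row
# back to a word id.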
def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of
    normalized probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1

def sample(prediction):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    #print(b/np.sum(b, 1)[:,None])
    return b / np.sum(b, 1)[:, None]

def one_hot_to_id(x):
    count = 0
    for i in x:
        for j in i:
            if j == 0.0:
                count += 1
            elif j == 1.0:
                return count
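
# Graph: a single hand-rolled LSTM cell. The four gate transforms are packed
# into one [embedding_size, 4*num_nodes] input matrix (ifcox), one recurrent
# matrix (ifcom) and one bias (ifcob); slicing the combined matmul output
# gives the input, forget, candidate and output gates. saved_output and
# saved_state are non-trainable Variables that carry the cell output and
# state from one session.run call to the next, and w/b project the cell
# output onto the vocabulary.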

embedding_size = 128
num_nodes = 32
graph = tf.Graph()
with graph.as_default():

    # Parameters:
    # Input, Forget, Candidate, Output gate: input, previous output, and bias,
    # packed into single matrices so all four gates come out of one matmul.
    ifcox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes*4], -0.1, 0.1))
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, num_nodes*4], -0.1, 0.1))
    ifcob = tf.Variable(tf.zeros([1, num_nodes*4]))

    # Word embeddings, shared between the training cell and the sampling cell.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between
        the previous state and the gates."""
        # i holds integer word ids; look them up in the shared embedding matrix.
        embed = tf.nn.embedding_lookup(embeddings, i)
        i = tf.to_float(embed)
        print(i.get_shape())
        combined = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob
        input_gate = tf.sigmoid(combined[:, 0:num_nodes])
        forget_gate = tf.sigmoid(combined[:, num_nodes:2*num_nodes])
        # Candidate values are squashed by the tanh below, not by a sigmoid.
        update = combined[:, 2*num_nodes:3*num_nodes]
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(combined[:, 3*num_nodes:4*num_nodes])
        print("O", output_gate)
        return output_gate * tf.tanh(state), state
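
    # Only a single time step is trained per session.run call: one placeholder
    # of word ids (train_inputs) and one of one-hot next-word labels
    # (train_labels). The per-unrolling placeholder lists below are created
    # but left unused, matching the commented-out unrolling code.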

    train_data = list()
    train_label = list()
    for _ in range(unrollings):
        train_data.append(tf.placeholder(shape=[batch_size], dtype=tf.int32))
        train_label.append(tf.placeholder(shape=[batch_size, vocabulary_size], dtype=tf.float32))
    #train_inputs = train_data[:unrollings]
    #train_labels = train_label[:unrollings]
    train_inputs = tf.placeholder(shape=[batch_size], dtype=tf.int32)
    train_labels = tf.placeholder(shape=[batch_size, vocabulary_size], dtype=tf.float32)
    print(train_inputs, train_labels)
    outputs = list()
    output = saved_output
    state = saved_state

    #for i in train_inputs:
    output, state = lstm_cell(train_inputs, output, state)
    outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        # Classifier.
        print(len(outputs))
        print(tf.concat(0, outputs).get_shape())
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        print(logits.get_shape(), train_labels.get_shape())
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits, train_labels))
        print(loss)

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(1.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Clip gradients to a global norm of 1.25 before applying them.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

    # Predictions: apply softmax so that logprob()/perplexity below operate on
    # normalized probabilities rather than raw logits.
    train_prediction = tf.nn.softmax(logits)
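
    # Sampling sub-graph: a batch-of-1 copy of the same cell, with its own
    # saved output/state Variables so text can be generated one word at a
    # time without disturbing the training state. reset_sample_state zeroes
    # that state before each generated sentence.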

    sample_input = tf.placeholder(tf.int32, shape=[1])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
    print("Done")

num_steps = 1001
summary_frequency = 100

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0])
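
# Training loop: every step pulls a fresh batch, runs one optimizer step, and
# accumulates the loss. Every summary_frequency steps it reports the average
# loss and minibatch perplexity, then generates a 100-word sample sentence
# from a random seed word using the batch-of-1 sampling path.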

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batches(train_dataset, batch_size=64, unrollings=5)
        # Flatten the single-unrolling batch into the shapes the placeholders expect.
        batch_inputs = np.reshape(batch_inputs, (batch_size))
        batch_labels = np.reshape(batch_labels, (batch_size, vocabulary_size))
        batch_inputs = np.array(batch_inputs).astype('int32')
        batch_labels = batch_labels.astype('float32')
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, l, lo, predictions, lr = session.run(
            [optimizer, loss, logits, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            label = batch_labels
            #print("L: ", label)
            #print("P:", predictions)
            print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, label))))
            #if step % (summary_frequency * 10) == 0:
            # Generate some samples, starting from a random seed word.
            print('=' * 80)
            feed = sample(random_distribution())
            feed1 = one_hot_to_id(feed)
            sentence = id_to_word(feed1)
            reset_sample_state.run()
            feed = np.reshape(feed1, (1))
            for _ in range(100):
                prediction = sample_prediction.eval({sample_input: feed})
                feed = sample(prediction)
                feed1 = one_hot_to_id(feed)
                sentence += ' ' + id_to_word(feed1)
                feed = np.reshape(feed1, (1))
            print("Sentence :", sentence)
            print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
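            # A minimal sketch of the validation-set perplexity measurement
            # hinted at by the comment above, assuming the batch-of-1
            # sample_prediction path can be reused for scoring; valid_dataset,
            # one_hot and logprob are all defined earlier in this script.
            valid_slice = valid_dataset[:200]   # a small slice keeps this cheap
            valid_logprob = 0.0
            for k in range(len(valid_slice) - 1):
                v_pred = sample_prediction.eval({sample_input: np.reshape(valid_slice[k], (1))})
                v_label = np.reshape(one_hot(valid_slice[k + 1]), (1, vocabulary_size))
                valid_logprob += logprob(v_pred, v_label)
            print('Validation perplexity (first 200 words): %.2f'
                  % float(np.exp(valid_logprob / (len(valid_slice) - 1))))
            reset_sample_state.run()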