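"""Word-level LSTM language model on the Penn Treebank (PTB) dataset.

Downloads and extracts the PTB archive, builds a word-to-id vocabulary,
trains a single-step LSTM language model with (older, pre-1.0) TensorFlow,
and periodically samples text and reports minibatch perplexity.
"""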
from __future__ import print_function

import collections
import math
import os
import random
import sys
import tarfile

import numpy as np
import tensorflow as tf
from six.moves.urllib.request import urlretrieve

url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
filename = 'simple-examples.tgz'
def get_ptb_dataset(filename):
    """Download the PTB archive if it is not already present on disk."""
    if not os.path.exists(filename):
        print('Attempting to download')
        filename, _ = urlretrieve(url, filename)
        print('Download Complete')
    statinfo = os.stat(filename)  # basic sanity check that the file exists
    return filename

filename = get_ptb_dataset(filename)
def extract(filename):
    root = os.path.splitext(filename)[0]
    if os.path.isdir(root):
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
        return root
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        sys.stdout.flush()
        tar = tarfile.open(filename)
        tar.extractall()
        tar.close()
        return root

filename = extract(filename)
filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
train_filename = os.path.join(filename, 'data/ptb.train.txt')
valid_filename = os.path.join(filename, 'data/ptb.valid.txt')
test_filename = os.path.join(filename, 'data/ptb.test.txt')
print(train_filename)
print(valid_filename)
print(test_filename)
def read_words(filename):
    """Read a PTB text file and split it into word tokens, marking line ends with <eos>."""
    with tf.gfile.GFile(filename, "r") as f:
        return f.read().decode('utf-8').replace("\n", "<eos>").split()

def build_vocab(filename):
    """Map each word in the training file to an integer id, most frequent words first."""
    data = read_words(filename)
    print("No of words", len(data))
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # print(count_pairs)  # very noisy: dumps every (word, count) pair
    words, _ = list(zip(*count_pairs))
    words_to_id = dict(zip(words, range(len(words))))
    return words_to_id
def file_to_word_ids(filename, word_to_id):
    """Convert a text file into a list of word ids, skipping out-of-vocabulary words."""
    data = read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

word_to_id = build_vocab(train_filename)
train_dataset = file_to_word_ids(train_filename, word_to_id)
valid_dataset = file_to_word_ids(valid_filename, word_to_id)
test_dataset = file_to_word_ids(test_filename, word_to_id)
vocabulary_size = len(word_to_id)
print('Vocabulary Size', vocabulary_size)
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10  # avoid log(0)
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]
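# Minibatch perplexity is reported later as exp(logprob(predictions, labels)),
# so `predictions` must already be softmax probabilities, not raw logits.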
batch_size = 64
data_index = 0
unrollings = 5

def generate_batches(raw_data, batch_size, unrollings):
    """Return one step of training data: a list of `batch_size` input word ids
    and the matching one-hot encoded next words.

    `unrollings` is accepted for compatibility but unused: the graph below only
    unrolls the LSTM for a single step.
    """
    global data_index
    inputs = [[]]
    labels = [[]]
    for i in range(batch_size):
        idx = (data_index + i) % (len(raw_data) - 1)
        inputs[0].append(raw_data[idx])
        labels[0].append(one_hot(raw_data[idx + 1]))
    data_index = (data_index + batch_size) % (len(raw_data) - 1)
    return inputs, labels
def one_hot(x):
    """One-hot encode a word id over the vocabulary."""
    rep = np.zeros(vocabulary_size)
    rep[x] = 1.0
    return rep
# Example batch (not used below; the training loop generates its own batches).
# train_input, train_labels = generate_batches(train_dataset, 50, unrollings=5)
# (An earlier draft used tf.nn.rnn_cell.LSTMCell; the cell is instead
# implemented manually in lstm_cell below.)
def id_to_word(i):
    """Look up the word string for a vocabulary id."""
    for word in word_to_id:
        if i == word_to_id[word]:
            return word
def sample_distribution(distribution):
    """Sample one element from a distribution assumed to be an array of
    normalized probabilities."""
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1
def sample(prediction):
    """Turn a (column) prediction into 1-hot encoded samples."""
    p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    return b / np.sum(b, 1)[:, None]
def one_hot_to_id(x):
    """Recover the word id from a one-hot encoded row."""
    return int(np.argmax(x))
embedding_size = 128
num_nodes = 32

graph = tf.Graph()
with graph.as_default():
    # Parameters:
    # Word embeddings, shared by the training and sampling paths.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # Input, Forget, Candidate, Output gates: input, previous output, and bias.
    ifcox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1))
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1))
    ifcob = tf.Variable(tf.zeros([1, num_nodes * 4]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create an LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between
        the previous state and the gates."""
        # `i` holds integer word ids; look them up in the shared embedding matrix.
        x = tf.nn.embedding_lookup(embeddings, i)
        combined = tf.matmul(x, ifcox) + tf.matmul(o, ifcom) + ifcob
        input_gate = tf.sigmoid(combined[:, 0:num_nodes])
        forget_gate = tf.sigmoid(combined[:, num_nodes:2 * num_nodes])
        update = combined[:, 2 * num_nodes:3 * num_nodes]
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(combined[:, 3 * num_nodes:4 * num_nodes])
        return output_gate * tf.tanh(state), state
    # Input data. Only a single step is fed per training iteration, so the
    # per-unrolling placeholder lists from earlier drafts are not needed.
    train_inputs = tf.placeholder(shape=[batch_size], dtype=tf.int32)
    train_labels = tf.placeholder(shape=[batch_size, vocabulary_size], dtype=tf.float32)
    # Unrolled LSTM loop (a single step here).
    outputs = list()
    output = saved_output
    state = saved_state
    output, state = lstm_cell(train_inputs, output, state)
    outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits, train_labels))

    # Optimizer. Note: 1.0 is an aggressive starting learning rate for Adam.
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(1.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

    # Predictions: softmax probabilities, so perplexity can be computed from them.
    train_prediction = tf.nn.softmax(logits)
    # Sampling: batch 1, one word at a time, with its own saved state.
    sample_input = tf.placeholder(tf.int32, shape=[1])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

print("Done")
num_steps = 1001
summary_frequency = 100

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batches(train_dataset, batch_size=batch_size, unrollings=5)
        batch_inputs = np.reshape(batch_inputs, (batch_size)).astype('int32')
        batch_labels = np.reshape(batch_labels, (batch_size, vocabulary_size)).astype('float32')
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, batch_labels))))
            # Generate some samples.
            print('=' * 80)
            reset_sample_state.run()
            # Seed the sampler with a word drawn from a random distribution.
            feed = sample(random_distribution())
            feed1 = one_hot_to_id(feed)
            sentence = id_to_word(feed1)
            feed = np.reshape(feed1, (1))
            for _ in range(100):
                prediction = sample_prediction.eval({sample_input: feed})
                feed = sample(prediction)
                feed1 = one_hot_to_id(feed)
                sentence += ' ' + id_to_word(feed1)
                feed = np.reshape(feed1, (1))
            print("Sentence :", sentence)
            print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
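            # A minimal sketch of the validation-set perplexity measurement the
            # comment above refers to (assumptions: validation words are fed one
            # at a time through the single-step sampling graph, and `valid_size`
            # is a cap introduced here only to keep the evaluation cheap).
            valid_size = min(1000, len(valid_dataset) - 1)
            valid_logprob = 0.0
            for k in range(valid_size):
                v_pred = sample_prediction.eval(
                    {sample_input: np.array([valid_dataset[k]], dtype=np.int32)})
                v_label = np.reshape(one_hot(valid_dataset[k + 1]), (1, vocabulary_size))
                valid_logprob += logprob(v_pred, v_label)
            print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))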