from itertools import chain
from pathlib import Path

import tensorflow as tf

from deeppavlov import build_model
from deeppavlov.core.commands.train import train_evaluate_model_from_config
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.core.data.utils import download_decompress
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.tf_model import TFModel
from deeppavlov.models.tokenizers.lazy_tokenizer import LazyTokenizer

# Download and unpack the PersonaChat dataset (uncomment on the first run):
# download_decompress('http://files.deeppavlov.ai/datasets/personachat_v2.tar.gz', './personachat')
@register('personachat_dataset_reader')
class PersonaChatDatasetReader(DatasetReader):
    """
    PersonaChat dataset from
    Zhang S. et al. Personalizing Dialogue Agents: I have a dog, do you have pets too?
    https://arxiv.org/abs/1801.07243
    This dataset is also used in ConvAI2: http://convai.io/

    This class reads the dataset into the following format:
    [{
        'persona': [list of persona sentences],
        'x': input utterance,
        'y': output utterance,
        'dialog_history': [list of previous utterances],
        'candidates': [list of candidate utterances],
        'y_idx': index of the y utterance in the candidates list
     },
     ...
    ]
    """
    def read(self, dir_path: str, mode='none_original'):
        dir_path = Path(dir_path)
        dataset = {}
        for dt in ['train', 'valid', 'test']:
            dataset[dt] = self._parse_data(dir_path / '{}_{}.txt'.format(dt, mode))
        return dataset

    @staticmethod
    def _parse_data(filename):
        examples = []
        print(filename)
        curr_persona = []
        curr_dialog_history = []
        persona_done = False
        with filename.open('r') as fin:
            for line in fin:
                # drop the leading line number
                line = ' '.join(line.strip().split(' ')[1:])
                your_persona_pref = 'your persona: '
                if line[:len(your_persona_pref)] == your_persona_pref and persona_done:
                    # a new dialog starts: reset persona and history
                    curr_persona = [line[len(your_persona_pref):]]
                    curr_dialog_history = []
                    persona_done = False
                elif line[:len(your_persona_pref)] == your_persona_pref:
                    curr_persona.append(line[len(your_persona_pref):])
                else:
                    persona_done = True
                    x, y, _, candidates = line.split('\t')
                    candidates = candidates.split('|')
                    example = {
                        'persona': curr_persona,
                        'x': x,
                        'y': y,
                        'dialog_history': curr_dialog_history[:],
                        'candidates': candidates,
                        'y_idx': candidates.index(y)
                    }
                    curr_dialog_history.extend([x, y])
                    examples.append(example)
        return examples
data = PersonaChatDatasetReader().read('./personachat')

for k in data:
    print(k, len(data[k]))

print(data['train'][0])
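# For orientation, a hypothetical sketch of what one parsed record looks like
# (the actual sentences come from the downloaded files; the strings below are
# made up for illustration only):
#
# {
#     'persona': ['i have a dog .', 'i like to read books .'],
#     'x': 'hello , how are you today ?',
#     'y': 'i am fine , i just walked my dog .',
#     'dialog_history': [],
#     'candidates': ['no idea .', 'i am fine , i just walked my dog .', ...],
#     'y_idx': 1
# }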
@register('personachat_iterator')
class PersonaChatIterator(DataLearningIterator):
    def split(self, *args, **kwargs):
        for dt in ['train', 'valid', 'test']:
            setattr(self, dt, self._to_tuple(getattr(self, dt)))

    @staticmethod
    def _to_tuple(data):
        """
        Returns:
            list of (x, y) pairs
        """
        return list(map(lambda x: (x['x'], x['y']), data))
iterator = PersonaChatIterator(data)

batch = [el for el in iterator.gen_batches(5, 'train')][0]
for x, y in zip(*batch):
    print('x:', x)
    print('y:', y)
    print('----------')
tokenizer = LazyTokenizer()
tokenizer(['Hello my friend'])
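# Expected output is roughly [['Hello', 'my', 'friend']]: the tokenizer splits
# each utterance into a list of word tokens.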
@register('dialog_vocab')
class DialogVocab(SimpleVocabulary):
    def fit(self, *args):
        tokens = chain(*args)
        super().fit(tokens)

    def __call__(self, batch, **kwargs):
        indices_batch = []
        for utt in batch:
            tokens = [self[token] for token in utt]
            indices_batch.append(tokens)
        return indices_batch
vocab = DialogVocab(
    save_path='./vocab.dict',
    load_path='./vocab.dict',
    min_freq=2,
    special_tokens=('<PAD>', '<BOS>', '<EOS>', '<UNK>',),
    unk_token='<UNK>'
)

vocab.fit(tokenizer(iterator.get_instances(data_type='train')[0]),
          tokenizer(iterator.get_instances(data_type='train')[1]))

vocab.save()

vocab.freqs.most_common(10)

len(vocab)

vocab([['<BOS>', 'hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this', '<EOS>', '<PAD>']])
@register('sentence_padder')
class SentencePadder(Component):
    def __init__(self, length_limit, pad_token_id=0, start_token_id=1, end_token_id=2, *args, **kwargs):
        self.length_limit = length_limit
        self.pad_token_id = pad_token_id
        self.start_token_id = start_token_id
        self.end_token_id = end_token_id

    def __call__(self, batch):
        for i in range(len(batch)):
            batch[i] = batch[i][:self.length_limit]
            batch[i] = [self.start_token_id] + batch[i] + [self.end_token_id]
            batch[i] += [self.pad_token_id] * (self.length_limit + 2 - len(batch[i]))
        return batch
padder = SentencePadder(length_limit=6)
vocab(padder(vocab([['hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this']])))
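# A sketch of the transformation above, assuming the default special-token ids
# <PAD>=0, <BOS>=1, <EOS>=2, <UNK>=3 (the other ids are hypothetical):
#
# vocab(...)   -> [[708, 58, 239, 3, 3]]            # the two OOV words map to <UNK>
# padder(...)  -> [[1, 708, 58, 239, 3, 3, 2, 0]]   # <BOS> ... <EOS>, padded to length_limit + 2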
def encoder(inputs, inputs_len, embedding_matrix, cell_size, keep_prob=1.0):
    # inputs: tf.int32 tensor with shape bs x seq_len with token ids
    # inputs_len: tf.int32 tensor with shape bs
    # embedding_matrix: tf.float32 tensor with shape vocab_size x vocab_dim
    # cell_size: hidden size of the recurrent cell
    # keep_prob: dropout keep probability
    with tf.variable_scope('encoder'):
        # first, embed every token in the input sequence
        # (tf.nn.embedding_lookup, don't forget about dropout)
        x_emb = tf.nn.dropout(tf.nn.embedding_lookup(embedding_matrix, inputs), keep_prob=keep_prob)

        # define the recurrent cell (LSTM or GRU)
        encoder_cell = tf.nn.rnn_cell.GRUCell(
            num_units=cell_size,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='encoder_cell')

        # use tf.nn.dynamic_rnn to encode the input sequence, using the actual sequence lengths
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(cell=encoder_cell, inputs=x_emb,
                                                           sequence_length=inputs_len,
                                                           dtype=tf.float32)
    return encoder_outputs, encoder_state
tf.reset_default_graph()

vocab_size = 100
hidden_dim = 100

inputs = tf.cast(tf.random_uniform(shape=[32, 10]) * vocab_size, tf.int32)  # bs x seq_len
mask = tf.cast(tf.random_uniform(shape=[32, 10]) * 2, tf.int32)  # bs x seq_len
inputs_len = tf.reduce_sum(mask, axis=1)
embedding_matrix = tf.random_uniform(shape=[vocab_size, hidden_dim])

encoder(inputs, inputs_len, embedding_matrix, hidden_dim)
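# A minimal shape check (a sketch; it assumes the call above is assigned to
# variables instead of being discarded):
#
# enc_outputs, enc_state = encoder(inputs, inputs_len, embedding_matrix, hidden_dim)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     o, s = sess.run([enc_outputs, enc_state])
#     print(o.shape)  # (32, 10, 100) -> bs x seq_len x cell_size
#     print(s.shape)  # (32, 100)     -> bs x cell_size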
def decoder(encoder_outputs, encoder_state, embedding_matrix, mask,
            cell_size, max_length, y_ph,
            start_token_id=1, keep_prob=1.0,
            teacher_forcing_rate_ph=None,
            use_attention=False, is_train=True):
    # encoder_outputs: tf.float32 tensor with shape bs x seq_len x encoder_cell_size
    # encoder_state: tf.float32 tensor with shape bs x encoder_cell_size
    # embedding_matrix: tf.float32 tensor with shape vocab_size x vocab_dim
    # mask: tf.int32 tensor with shape bs x seq_len with zeros for masked sequence elements
    # cell_size: hidden size of the recurrent cell
    # max_length: max length of the output sequence
    # start_token_id: id of the <BOS> token in the vocabulary
    # keep_prob: dropout keep probability
    # teacher_forcing_rate_ph: rate of using teacher forcing on each decoding step
    # use_attention: use attention over encoder outputs or use only encoder_state
    # is_train: training or inference? at inference time we can't use teacher forcing
    with tf.variable_scope('decoder'):
        # define the decoder recurrent cell
        decoder_cell = tf.nn.rnn_cell.GRUCell(
            num_units=cell_size,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='decoder_cell')

        # initial value of output_token from the previous step is the start token
        output_token = tf.ones(shape=(tf.shape(encoder_outputs)[0],), dtype=tf.int32) * start_token_id

        # initialize the decoder state with encoder_state
        decoder_state = encoder_state

        pred_tokens = []
        logits = []

        # use a for loop to sequentially call the recurrent cell
        for i in range(max_length):
            """
            TEACHER FORCING
            # here you can try to implement teacher forcing for your model
            # details about teacher forcing are explained further in the tutorial
            # pseudo code:
            NOTE THAT THE FOLLOWING CONDITIONS SHOULD BE EVALUATED AT GRAPH RUNTIME:
            use tf.cond and tf.logical operations instead of a python if

            if i > 0 and is_train and random_value < teacher_forcing_rate_ph:
                input_token = y_ph[:, i-1]
            else:
                input_token = output_token

            input_token_emb = tf.nn.embedding_lookup(embedding_matrix, input_token)
            """
            if i > 0:
                input_token_emb = tf.cond(
                    tf.logical_and(
                        is_train,
                        tf.random_uniform(shape=(), maxval=1) <= teacher_forcing_rate_ph
                    ),
                    lambda: tf.nn.embedding_lookup(embedding_matrix, y_ph[:, i - 1]),  # teacher forcing
                    lambda: tf.nn.embedding_lookup(embedding_matrix, output_token)
                )
            else:
                input_token_emb = tf.nn.embedding_lookup(embedding_matrix, output_token)

            """
            ATTENTION MECHANISM
            # here you can add attention to your model
            # you can find details about attention further in the tutorial
            """
            if use_attention:
                # compute attention and concatenate the attention vector to input_token_emb
                att = dot_attention(encoder_outputs, decoder_state, mask, scope='att')
                input_token_emb = tf.concat([input_token_emb, att], axis=-1)

            input_token_emb = tf.nn.dropout(input_token_emb, keep_prob=keep_prob)

            # call the recurrent cell
            decoder_outputs, decoder_state = decoder_cell(input_token_emb, decoder_state)
            decoder_outputs = tf.nn.dropout(decoder_outputs, keep_prob=keep_prob)

            # project the decoder output to the embeddings dimension
            embeddings_dim = embedding_matrix.get_shape()[1]
            output_proj = tf.layers.dense(decoder_outputs, embeddings_dim, activation=tf.nn.tanh,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name='proj', reuse=tf.AUTO_REUSE)

            # compute logits over the vocabulary
            output_logits = tf.matmul(output_proj, embedding_matrix, transpose_b=True)
            logits.append(output_logits)

            output_probs = tf.nn.softmax(output_logits)
            output_token = tf.argmax(output_probs, axis=-1)
            pred_tokens.append(output_token)

        y_pred_tokens = tf.transpose(tf.stack(pred_tokens, axis=0), [1, 0])
        y_logits = tf.transpose(tf.stack(logits, axis=0), [1, 0, 2])
    return y_pred_tokens, y_logits
tf.reset_default_graph()

vocab_size = 100
hidden_dim = 100

inputs = tf.cast(tf.random_uniform(shape=[32, 10]) * vocab_size, tf.int32)  # bs x seq_len
mask = tf.cast(tf.random_uniform(shape=[32, 10]) * 2, tf.int32)  # bs x seq_len
inputs_len = tf.reduce_sum(mask, axis=1)
embedding_matrix = tf.random_uniform(shape=[vocab_size, hidden_dim])
teacher_forcing_rate = tf.random_uniform(shape=())
y = tf.cast(tf.random_uniform(shape=[32, 10]) * vocab_size, tf.int32)

encoder_outputs, encoder_state = encoder(inputs, inputs_len, embedding_matrix, hidden_dim)
decoder(encoder_outputs, encoder_state, embedding_matrix, mask, hidden_dim, max_length=10,
        y_ph=y, teacher_forcing_rate_ph=teacher_forcing_rate)
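# A minimal shape check for the decoder (a sketch; it assumes the call above is
# assigned instead of being discarded):
#
# y_pred_tokens, y_logits = decoder(encoder_outputs, encoder_state, embedding_matrix, mask,
#                                   hidden_dim, max_length=10, y_ph=y,
#                                   teacher_forcing_rate_ph=teacher_forcing_rate)
# y_pred_tokens: bs x max_length              -> (32, 10)
# y_logits:      bs x max_length x vocab_size -> (32, 10, 100)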
@register('seq2seq')
class Seq2Seq(TFModel):
    def __init__(self, **kwargs):
        # hyperparameters

        # dimension of word embeddings
        self.embeddings_dim = kwargs.get('embeddings_dim', 100)
        # size of the recurrent cell in encoder and decoder
        self.cell_size = kwargs.get('cell_size', 200)
        # dropout keep probability
        self.keep_prob = kwargs.get('keep_prob', 0.8)
        # learning rate
        self.learning_rate = kwargs.get('learning_rate', 3e-04)
        # max length of the output sequence
        self.max_length = kwargs.get('max_length', 20)
        self.grad_clip = kwargs.get('grad_clip', 5.0)
        self.start_token_id = kwargs.get('start_token_id', 1)
        self.vocab_size = kwargs.get('vocab_size', 11595)
        self.teacher_forcing_rate = kwargs.get('teacher_forcing_rate', 0.0)
        self.use_attention = kwargs.get('use_attention', False)

        # create a tensorflow session to run the computational graph in
        self.sess_config = tf.ConfigProto(allow_soft_placement=True)
        self.sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.sess_config)

        self.init_graph()

        # define the train op
        self.train_op = self.get_train_op(self.loss, self.lr_ph,
                                          optimizer=tf.train.AdamOptimizer,
                                          clip_norm=self.grad_clip)
        # initialize graph variables
        self.sess.run(tf.global_variables_initializer())

        super().__init__(**kwargs)

        # load a saved model if there is one
        if self.load_path is not None:
            self.load()
    def init_graph(self):
        # create placeholders
        self.init_placeholders()

        # 0/1 masks: 1 for real tokens, 0 for padding (the <PAD> id is 0)
        self.x_mask = tf.cast(tf.cast(self.x_ph, tf.bool), tf.int32)
        self.y_mask = tf.cast(tf.cast(self.y_ph, tf.bool), tf.int32)

        self.x_len = tf.reduce_sum(self.x_mask, axis=1)

        # create an embeddings matrix for tokens
        self.embeddings = tf.Variable(
            tf.random_uniform((self.vocab_size, self.embeddings_dim), -0.1, 0.1, name='embeddings'),
            dtype=tf.float32)

        # encoder
        encoder_outputs, encoder_state = encoder(self.x_ph, self.x_len, self.embeddings, self.cell_size,
                                                 self.keep_prob_ph)

        # decoder
        self.y_pred_tokens, y_logits = decoder(encoder_outputs, encoder_state, self.embeddings, self.x_mask,
                                               self.cell_size, self.max_length,
                                               self.y_ph, self.start_token_id, self.keep_prob_ph,
                                               self.teacher_forcing_rate_ph, self.use_attention, self.is_train_ph)

        # loss: masked softmax cross-entropy, averaged over non-padding tokens
        self.y_ohe = tf.one_hot(self.y_ph, depth=self.vocab_size)
        self.y_mask = tf.cast(self.y_mask, tf.float32)
        self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.y_ohe, logits=y_logits) * self.y_mask
        self.loss = tf.reduce_sum(self.loss) / tf.reduce_sum(self.y_mask)
    def init_placeholders(self):
        # placeholders for inputs
        self.x_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='x_ph')
        # y_ph is part of the computational graph even at inference time (it is used
        # when teacher forcing is activated), so we give it a dummy default value;
        # this dummy value is never actually used at inference
        self.y_ph = tf.placeholder_with_default(tf.zeros_like(self.x_ph), shape=(None, None), name='y_ph')

        # placeholders for model parameters
        self.lr_ph = tf.placeholder(dtype=tf.float32, shape=[], name='lr_ph')
        self.keep_prob_ph = tf.placeholder_with_default(1.0, shape=[], name='keep_prob_ph')
        self.is_train_ph = tf.placeholder_with_default(False, shape=[], name='is_train_ph')
        self.teacher_forcing_rate_ph = tf.placeholder_with_default(0.0, shape=[], name='teacher_forcing_rate_ph')

    def _build_feed_dict(self, x, y=None):
        feed_dict = {
            self.x_ph: x,
        }
        if y is not None:
            feed_dict.update({
                self.y_ph: y,
                self.lr_ph: self.learning_rate,
                self.keep_prob_ph: self.keep_prob,
                self.is_train_ph: True,
                self.teacher_forcing_rate_ph: self.teacher_forcing_rate,
            })
        return feed_dict

    def train_on_batch(self, x, y):
        feed_dict = self._build_feed_dict(x, y)
        loss, _ = self.sess.run([self.loss, self.train_op], feed_dict=feed_dict)
        return loss

    def __call__(self, x):
        feed_dict = self._build_feed_dict(x)
        y_pred = self.sess.run(self.y_pred_tokens, feed_dict=feed_dict)
        return y_pred
s2s = Seq2Seq(
    save_path='./save/seq2seq_model',
    load_path='./save/seq2seq_model'
)

vocab(s2s(padder(vocab([['hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this']]))))
def softmax_mask(values, mask):
    # adds a large negative value to masked positions
    INF = 1e30
    return -INF * (1 - tf.cast(mask, tf.float32)) + values
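# A tiny numeric illustration (hypothetical values): with values = [2.0, 5.0, 1.0]
# and mask = [1, 1, 0], the last logit becomes roughly -1e30, so after softmax the
# masked position receives (practically) zero attention weight.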
def dot_attention(memory, state, mask, scope="dot_attention"):
    # memory: bs x seq_len x hidden_dim
    # state: bs x hidden_dim
    # mask: bs x seq_len
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # dot product between each item in memory and the state
        logits = tf.matmul(memory, tf.expand_dims(state, axis=1), transpose_b=True)
        logits = tf.squeeze(logits, [2])

        # apply the mask to the logits
        logits = softmax_mask(logits, mask)

        # apply softmax to the logits to get attention weights
        att_weights = tf.expand_dims(tf.nn.softmax(logits), axis=2)

        # compute the weighted sum of items in memory
        att = tf.reduce_sum(att_weights * memory, axis=1)
        return att
tf.reset_default_graph()

memory = tf.random_normal(shape=[32, 10, 100])  # bs x seq_len x hidden_dim
state = tf.random_normal(shape=[32, 100])  # bs x hidden_dim
mask = tf.cast(tf.random_uniform(shape=[32, 10]) * 2, tf.int32)  # bs x seq_len, 0/1 values
dot_attention(memory, state, mask)
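# A quick shape check (a sketch; it assumes the call above is assigned to a variable):
#
# att = dot_attention(memory, state, mask)
# with tf.Session() as sess:
#     print(sess.run(att).shape)  # (32, 100) -> bs x hidden_dim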
@register('postprocessing')
class SentencePostprocessor(Component):
    def __init__(self, pad_token='<PAD>', start_token='<BOS>', end_token='<EOS>', *args, **kwargs):
        self.pad_token = pad_token
        self.start_token = start_token
        self.end_token = end_token

    def __call__(self, batch):
        for i in range(len(batch)):
            batch[i] = ' '.join(self._postproc(batch[i]))
        return batch

    def _postproc(self, utt):
        if self.end_token in utt:
            utt = utt[:utt.index(self.end_token)]
        return utt


postprocess = SentencePostprocessor()

postprocess(vocab(s2s(padder(vocab([['hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this']])))))
config = {
    "dataset_reader": {
        "class_name": "personachat_dataset_reader",
        "data_path": "./personachat"
    },
    "dataset_iterator": {
        "class_name": "personachat_iterator",
        "seed": 1337,
        "shuffle": True
    },
    "chainer": {
        "in": ["x"],
        "in_y": ["y"],
        "pipe": [
            {
                "class_name": "lazy_tokenizer",
                "id": "tokenizer",
                "in": ["x"],
                "out": ["x_tokens"]
            },
            {
                "class_name": "lazy_tokenizer",
                "id": "tokenizer",
                "in": ["y"],
                "out": ["y_tokens"]
            },
            {
                "class_name": "dialog_vocab",
                "id": "vocab",
                "save_path": "./vocab.dict",
                "load_path": "./vocab.dict",
                "min_freq": 2,
                "special_tokens": ["<PAD>", "<BOS>", "<EOS>", "<UNK>"],
                "unk_token": "<UNK>",
                "fit_on": ["x_tokens", "y_tokens"],
                "in": ["x_tokens"],
                "out": ["x_tokens_ids"]
            },
            {
                "ref": "vocab",
                "in": ["y_tokens"],
                "out": ["y_tokens_ids"]
            },
            {
                "class_name": "sentence_padder",
                "id": "padder",
                "length_limit": 20,
                "in": ["x_tokens_ids"],
                "out": ["x_tokens_ids"]
            },
            {
                "ref": "padder",
                "in": ["y_tokens_ids"],
                "out": ["y_tokens_ids"]
            },
            {
                "class_name": "seq2seq",
                "id": "s2s",
                "max_length": "#padder.length_limit+2",
                "cell_size": 250,
                "embeddings_dim": 50,
                "vocab_size": 11595,
                "keep_prob": 0.8,
                "learning_rate": 3e-04,
                "teacher_forcing_rate": 0.0,
                "use_attention": False,
                "save_path": "./save/seq2seq_model",
                "load_path": "./save/seq2seq_model",
                "in": ["x_tokens_ids"],
                "in_y": ["y_tokens_ids"],
                "out": ["y_predicted_tokens_ids"]
            },
            {
                "ref": "vocab",
                "in": ["y_predicted_tokens_ids"],
                "out": ["y_predicted_tokens"]
            },
            {
                "class_name": "postprocessing",
                "in": ["y_predicted_tokens"],
                "out": ["y_predicted_tokens"]
            }
        ],
        "out": ["y_predicted_tokens"]
    },
    "train": {
        "log_every_n_batches": 100,
        "val_every_n_epochs": 0,
        "batch_size": 64,
        "validation_patience": 0,
        "epochs": 20,
        "metrics": ["bleu"]
    }
}
model = build_model(config)

# before training (i.e. with no saved checkpoint at load_path), the predictions look random
model(['Hi, how are you?', 'Any ideas my dear friend?'])

# train the model and evaluate it according to the config's "train" section
train_evaluate_model_from_config(config=config)

# rebuild the model from the saved checkpoint and try it again
model = build_model(config)
model(['hi, how are you?', 'any ideas my dear friend?', 'okay, i agree with you', 'good bye!'])