Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import numpy as np
- import tensorflow as tf
- import decoder
- from morpho_dataset import MorphoDataset
- class Network:
- def __init__(self, args, num_source_chars, num_target_chars):
- class Model(tf.keras.Model):
- def __init__(self):
- super().__init__()
- # TODO(lemmatizer_noattn): Define
- # - source_embeddings as a masked embedding layer of source chars into args.cle_dim dimensions
- self.source_embeddings = tf.keras.layers.Embedding(
- input_dim = num_source_chars,
- output_dim = args.cle_dim,
- mask_zero = True
- )
- # TODO: Define
- # - source_rnn as a bidirectional GRU with args.rnn_dim units, returning _whole sequences_, summing opposite directions
- self.source_rnn = tf.keras.layers.Bidirectional(
- tf.keras.layers.GRU(
- args.rnn_cell_dim,
- return_sequences=True
- )
- )
- # TODO(lemmatizer_noattn): Define
- # - target_embedding as an unmasked embedding layer of target chars into args.cle_dim dimensions
- # - target_rnn_cell as a GRUCell with args.rnn_dim units
- # - target_output_layer as a Dense layer into `num_target_chars`
- self.target_embedding = tf.keras.layers.Embedding(
- output_dim = args.cle_dim,
- mask_zero = False
- )
- self.target_rnn_cell = tf.keras.layers.GRU(
- args.rnn_cell_dim
- )
- self.target_output_layer = tf.keras.layers.Dense(
- num_target_chars
- )
- # TODO: Define
- # - attention_source_layer as a Dense layer with args.rnn_dim outputs
- # - attention_state_layer as a Dense layer with args.rnn_dim outputs
- # - attention_weight_layer as a Dense layer with 1 output
- self.attention_source_layer = tf.keras.layers.Dense(
- args.rnn_dim
- )
- self.attention_state_layer = tf.keras.layers.Dense(
- args.rnn_dim
- )
- self.attention_weight_layer = tf.keras.layers.Dense(
- 1
- )
- self._model = Model()
- self._optimizer = tf.optimizers.Adam()
- # TODO(lemmatizer_noattn): Define self._loss as SparseCategoricalCrossentropy which processes _logits_ instead of probabilities
- self._metrics_training = {"loss": tf.metrics.Mean(), "accuracy": tf.metrics.SparseCategoricalAccuracy()}
- self._metrics_evaluation = {"accuracy": tf.metrics.Mean()}
- self._writer = tf.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)
- def _append_eow(self, sequences):
- """Append EOW character after end every given sequence."""
- sequences_rev = tf.reverse_sequence(sequences, tf.reduce_sum(tf.cast(tf.not_equal(sequences, 0), tf.int32), axis=1), 1)
- sequences_rev_eow = tf.pad(sequences_rev, [[0, 0], [1, 0]], constant_values=MorphoDataset.Factor.EOW)
- return tf.reverse_sequence(sequences_rev_eow, tf.reduce_sum(tf.cast(tf.not_equal(sequences_rev_eow, 0), tf.int32), axis=1), 1)
- @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 4, autograph=False)
- def train_batch(self, source_charseq_ids, source_charseqs, target_charseq_ids, target_charseqs):
- # TODO(lemmatizer_noattn): Modify target_charseqs by appending EOW; only the version with appended EOW is used from now on.
- with tf.GradientTape() as tape:
- # TODO(lemmatizer_noattn): Embed source charseqs
- # TODO: Run self._model.source_rnn on the embedded sequences, returning outputs in `source_encoded`.
- # Copy the source_encoded to corresponding batch places, and then flatten it
- source_mask = tf.not_equal(source_charseq_ids, 0)
- source_encoded = tf.boolean_mask(tf.gather(source_encoded, source_charseq_ids), source_mask)
- targets = tf.boolean_mask(tf.gather(target_charseqs, target_charseq_ids), source_mask)
- class DecoderTraining(decoder.BaseDecoder):
- @property
- def batch_size(self): raise NotImplemented() # TODO: Return the batch size of self._source_encoded, using tf.shape
- @property
- def output_size(self): raise NotImplemented() # TODO(lemmatizer_noattn): Return the number logits per each output
- @property
- def output_dtype(self): raise NotImplemented() # TODO(lemmatizer_noattn): Return the type of the logits
- def _with_attention(self, inputs, states):
- # TODO: Compute the attention.
- # - Take self._source_encoded and pass it through the self._model.attention_source_layer.
- # Because self._source_encoded does not change, you should in fact do it in `initialize`.
- # - Pass `states` though self._model.attention_state_layer.
- # - Sum the two outputs. However, the first has shape [a, b, c] and the second [a, c]. Therefore,
- # somehow expand the second to [a, b, c] first. (Hint: use broadcasting rules.)
- # - Pass the sum through `tf.tanh`, then self._model.attention_weight_layer.
- # - Then, run softmax on a suitable axis (the one corresponding to characters), generating `weights`.
- # - Multiply `self._source_encoded` with `weights` and sum the result in the axis
- # corresponding to characters, generating `attention`. Therefore, `attention` is a a fixed-size
- # representation for every batch element, independently on how many characters had
- # the corresponding input forms.
- # - Finally concatenate `inputs` and `attention` and return the result.
- def initialize(self, layer_inputs, initial_state=None):
- self._model, self._source_encoded, self._targets = layer_inputs
- # TODO(lemmatozer_noattn): Define `finished` as a vector of self.batch_size of `False` [see tf.fill].
- # TODO(lemmatizer_noattn): Define `inputs` as a vector of self.batch_size of MorphoDataset.Factor.BOW [see tf.fill],
- # embedded using self._model.target_embedding
- # TODO: Define `states` as the last words from self._source_encoded
- # TODO: Pass `inputs` through `self._with_attention(inputs, states)`.
- return finished, inputs, states
- def step(self, time, inputs, states):
- # TODO(lemmatizer_noattn): Pass `inputs` and `[states]` through self._model.target_rnn_cell, generating
- # `outputs, [states]`.
- # TODO(lemmatizer_noattn): Overwrite `outputs` by passing them through self._model.target_output_layer,
- # TODO(lemmatizer_noattn): Define `next_inputs` by embedding `time`-th words from `self._targets`.
- # TODO(lemmatizer_noattn): Define `finished` as True if `time`-th word from `self._targets` is EOW, False otherwise.
- # Again, no == or !=.
- # TODO: Pass `next_inputs` through `self._with_attention(inputs, states)`.
- return outputs, states, next_inputs, finished
- output_layer, _, _ = DecoderTraining()([self._model, source_encoded, targets])
- # TODO(lemmatizer_noattn): Compute loss. Use only nonzero `targets` as a mask.
- gradients = tape.gradient(loss, self._model.variables)
- self._optimizer.apply_gradients(zip(gradients, self._model.variables))
- tf.summary.experimental.set_step(self._optimizer.iterations)
- with self._writer.as_default():
- for name, metric in self._metrics_training.items():
- metric.reset_states()
- if name == "loss": metric(loss)
- else: metric(targets, output_layer, tf.not_equal(targets, 0))
- tf.summary.scalar("train/{}".format(name), metric.result())
- return tf.math.argmax(output_layer, axis=2)
- def train_epoch(self, dataset, args):
- for batch in dataset.batches(args.batch_size):
- # TODO(lemmatizer_noattn): Call train_batch, storing results in `predictions`.
- form, gold_lemma, system_lemma = "", "", ""
- for i in batch[dataset.FORMS].charseqs[1]:
- if i: form += dataset.data[dataset.FORMS].alphabet[i]
- for i in range(len(batch[dataset.LEMMAS].charseqs[1])):
- if batch[dataset.LEMMAS].charseqs[1][i]:
- gold_lemma += dataset.data[dataset.LEMMAS].alphabet[batch[dataset.LEMMAS].charseqs[1][i]]
- system_lemma += dataset.data[dataset.LEMMAS].alphabet[predictions[0][i]]
- print(float(self._metrics_training["accuracy"].result()), form, gold_lemma, system_lemma)
- @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 2, autograph=False)
- def predict_batch(self, source_charseq_ids, source_charseqs):
- # TODO(lemmatizer_noattn)(train_batch): Embed source charseqs
- # TODO(train_batch): Run self._model.source_rnn on the embedded sequences, returning outputs in `source_encoded`.
- # Copy the source_encoded to corresponding batch places, and then flatten it
- source_mask = tf.not_equal(source_charseq_ids, 0)
- source_encoded = tf.boolean_mask(tf.gather(source_encoded, source_charseq_ids), source_mask)
- class DecoderPrediction(decoder.BaseDecoder):
- @property
- def batch_size(self): raise NotImplemented() # TODO(lemmatizer_noattn)(train_batch): Return the batch size of self._source_encoded, using tf.shape
- @property
- def output_size(self): raise NotImplemented() # TODO(lemmatizer_noattn): Return 1 because we are returning directly the predictions
- @property
- def output_dtype(self): return NotImplemented() # TODO(lemmatizer_noattn): Return tf.int32 because the predictions are integral
- def _with_attention(self, inputs, states):
- # TODO: A copy of _with_attention from train_batch; you can of course
- # move the definition to a place where it can be reused in both places.
- def initialize(self, layer_inputs, initial_state=None):
- self._model, self._source_encoded = layer_inputs
- # TODO(lemmatizer_noattn)(train_batch): Define `finished` as a vector of self.batch_size of `False` [see tf.fill].
- # TODO(lemmatizer_noattn)(train_batch): Define `inputs` as a vector of self.batch_size of MorphoDataset.Factor.BOW [see tf.fill],
- # embedded using self._model.target_embedding
- # TODO(train_batch): Define `states` as the last words from self._source_encoded
- # TODO(train_batch): Pass `inputs` through `self._with_attention(inputs, states)`.
- return finished, inputs, states
- def step(self, time, inputs, states):
- # TODO(lemmatizer_noattn)(train_batch): Pass `inputs` and `[states]` through self._model.target_rnn_cell, generating
- # `outputs, [states]`.
- # TODO(lemmatizer_noattn)(train_batch): Overwrite `outputs` by passing them through self._model.target_output_layer,
- # TODO(lemmatizer_noattn): Overwirte `outputs` by passing them through `tf.argmax` on suitable axis and with
- # `output_type=tf.int32` parameter.
- # TODO(lemmatizer_noattn): Define `next_inputs` by embedding the `outputs`
- # TODO(lemmatizer_noattn): Define `finished` as True if `outputs` are EOW, False otherwise. [No == or !=].
- # TODO(train_batch): Pass `next_inputs` through `self._with_attention(inputs, states)`.
- return outputs, states, next_inputs, finished
- predictions, _, _ = DecoderPrediction(maximum_iterations=tf.shape(source_charseqs)[1] + 10)([self._model, source_encoded])
- return predictions
- @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 4, autograph=False)
- def evaluate_batch(self, source_charseq_ids, source_charseqs, target_charseq_ids, target_charseqs):
- # Predict
- predictions = self.predict_batch(source_charseq_ids, source_charseqs)
- # Append EOW to target_charseqs and copy them to corresponding places and flatten it
- target_charseqs = self._append_eow(target_charseqs)
- targets = tf.boolean_mask(tf.gather(target_charseqs, target_charseq_ids), tf.not_equal(source_charseq_ids, 0))
- # Compute accuracy, but on the whole sequences
- mask = tf.cast(tf.not_equal(targets, 0), tf.int32)
- resized_predictions = tf.concat([predictions, tf.zeros_like(targets)], axis=1)[:, :tf.shape(targets)[1]]
- equals = tf.reduce_all(tf.equal(resized_predictions * mask, targets * mask), axis=1)
- self._metrics_evaluation["accuracy"](equals)
- def evaluate(self, dataset, dataset_name, args):
- for metric in self._metrics_evaluation.values():
- metric.reset_states()
- for batch in dataset.batches(args.batch_size):
- predictions = self.evaluate_batch(batch[dataset.FORMS].charseq_ids, batch[dataset.FORMS].charseqs,
- batch[dataset.LEMMAS].charseq_ids, batch[dataset.LEMMAS].charseqs)
- metrics = {name: float(metric.result()) for name, metric in self._metrics_evaluation.items()}
- with self._writer.as_default():
- for name, value in metrics.items():
- tf.summary.scalar("{}/{}".format(dataset_name, name), value)
- return metrics
- if __name__ == "__main__":
- import argparse
- import datetime
- import os
- import re
- # Parse arguments
- parser = argparse.ArgumentParser()
- parser.add_argument("--batch_size", default=10, type=int, help="Batch size.")
- parser.add_argument("--cle_dim", default=64, type=int, help="CLE embedding dimension.")
- parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
- parser.add_argument("--max_sentences", default=5000, type=int, help="Maximum number of sentences to load.")
- parser.add_argument("--recodex", default=False, action="store_true", help="Evaluation in ReCodEx.")
- parser.add_argument("--rnn_dim", default=64, type=int, help="RNN cell dimension.")
- parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
- args = parser.parse_args()
- # Fix random seeds and number of threads
- np.random.seed(42)
- tf.random.set_seed(42)
- if args.recodex:
- tf.keras.utils.get_custom_objects()["glorot_uniform"] = lambda: tf.initializers.glorot_uniform(seed=42)
- tf.keras.utils.get_custom_objects()["orthogonal"] = lambda: tf.initializers.orthogonal(seed=42)
- tf.keras.utils.get_custom_objects()["uniform"] = lambda: tf.initializers.RandomUniform(seed=42)
- tf.config.threading.set_inter_op_parallelism_threads(args.threads)
- tf.config.threading.set_intra_op_parallelism_threads(args.threads)
- # Create logdir name
- args.logdir = os.path.join("logs", "{}-{}-{}".format(
- os.path.basename(__file__),
- datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
- ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value) for key, value in sorted(vars(args).items())))
- ))
- # Load the data
- morpho = MorphoDataset("czech_cac", max_sentences=args.max_sentences)
- # Create the network and train
- network = Network(args,
- num_source_chars=len(morpho.train.data[morpho.train.FORMS].alphabet),
- num_target_chars=len(morpho.train.data[morpho.train.LEMMAS].alphabet))
- for epoch in range(args.epochs):
- network.train_epoch(morpho.train, args)
- metrics = network.evaluate(morpho.dev, "dev", args)
- print("Evaluation on {}, epoch {}: {}".format("dev", epoch + 1, metrics))
- metrics = network.evaluate(morpho.test, "test", args)
- with open("lemmatizer.out", "w") as out_file:
- print("{:.2f}".format(100 * metrics["accuracy"]), file=out_file)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement