#!/usr/bin/env python3
import numpy as np
import tensorflow as tf

import decoder
from morpho_dataset import MorphoDataset

class Network:
    def __init__(self, args, num_source_chars, num_target_chars):
        class Model(tf.keras.Model):
            def __init__(self):
                super().__init__()

                # TODO(lemmatizer_noattn): Define
                # - source_embeddings as a masked embedding layer of source chars into args.cle_dim dimensions
                self.source_embeddings = tf.keras.layers.Embedding(
                    input_dim=num_source_chars,
                    output_dim=args.cle_dim,
                    mask_zero=True)

                # TODO: Define
                # - source_rnn as a bidirectional GRU with args.rnn_dim units, returning _whole sequences_, summing opposite directions
                self.source_rnn = tf.keras.layers.Bidirectional(
                    tf.keras.layers.GRU(args.rnn_dim, return_sequences=True),
                    merge_mode="sum")

                # TODO(lemmatizer_noattn): Define
                # - target_embedding as an unmasked embedding layer of target chars into args.cle_dim dimensions
                # - target_rnn_cell as a GRUCell with args.rnn_dim units
                # - target_output_layer as a Dense layer into `num_target_chars`
                self.target_embedding = tf.keras.layers.Embedding(
                    input_dim=num_target_chars,
                    output_dim=args.cle_dim,
                    mask_zero=False)
                self.target_rnn_cell = tf.keras.layers.GRUCell(args.rnn_dim)
                self.target_output_layer = tf.keras.layers.Dense(num_target_chars)

                # TODO: Define
                # - attention_source_layer as a Dense layer with args.rnn_dim outputs
                # - attention_state_layer as a Dense layer with args.rnn_dim outputs
                # - attention_weight_layer as a Dense layer with 1 output
                self.attention_source_layer = tf.keras.layers.Dense(args.rnn_dim)
                self.attention_state_layer = tf.keras.layers.Dense(args.rnn_dim)
                self.attention_weight_layer = tf.keras.layers.Dense(1)

        self._model = Model()

        self._optimizer = tf.optimizers.Adam()
        # TODO(lemmatizer_noattn): Define self._loss as SparseCategoricalCrossentropy which processes _logits_ instead of probabilities
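        # One way to satisfy the TODO above (a sketch): sparse cross-entropy computed on logits.
        self._loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)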

        self._metrics_training = {"loss": tf.metrics.Mean(), "accuracy": tf.metrics.SparseCategoricalAccuracy()}
        self._metrics_evaluation = {"accuracy": tf.metrics.Mean()}
        self._writer = tf.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)

    def _append_eow(self, sequences):
        """Append the EOW character after the end of every given sequence."""
        sequences_rev = tf.reverse_sequence(sequences, tf.reduce_sum(tf.cast(tf.not_equal(sequences, 0), tf.int32), axis=1), 1)
        sequences_rev_eow = tf.pad(sequences_rev, [[0, 0], [1, 0]], constant_values=MorphoDataset.Factor.EOW)
        return tf.reverse_sequence(sequences_rev_eow, tf.reduce_sum(tf.cast(tf.not_equal(sequences_rev_eow, 0), tf.int32), axis=1), 1)

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 4, autograph=False)
    def train_batch(self, source_charseq_ids, source_charseqs, target_charseq_ids, target_charseqs):
        # TODO(lemmatizer_noattn): Modify target_charseqs by appending EOW; only the version with appended EOW is used from now on.
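        # A possible realization of the TODO above, using the helper defined earlier.
        target_charseqs = self._append_eow(target_charseqs)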

        with tf.GradientTape() as tape:
            # TODO(lemmatizer_noattn): Embed source charseqs
            # TODO: Run self._model.source_rnn on the embedded sequences, returning outputs in `source_encoded`.
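            # One possible implementation of the two TODOs above (a sketch).
            source_embedded = self._model.source_embeddings(source_charseqs)
            source_encoded = self._model.source_rnn(source_embedded)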

            # Copy the source_encoded to corresponding batch places, and then flatten it
            source_mask = tf.not_equal(source_charseq_ids, 0)
            source_encoded = tf.boolean_mask(tf.gather(source_encoded, source_charseq_ids), source_mask)
            targets = tf.boolean_mask(tf.gather(target_charseqs, target_charseq_ids), source_mask)

            class DecoderTraining(decoder.BaseDecoder):
                # The three properties below follow the TODO hints; the exact expressions are one possible reading of them.
                @property
                def batch_size(self): return tf.shape(self._source_encoded)[0]  # TODO: Return the batch size of self._source_encoded, using tf.shape
                @property
                def output_size(self): return self._model.target_output_layer.units  # TODO(lemmatizer_noattn): Return the number of logits per each output
                @property
                def output_dtype(self): return tf.float32  # TODO(lemmatizer_noattn): Return the type of the logits

                def _with_attention(self, inputs, states):
                    # TODO: Compute the attention.
                    # - Take self._source_encoded and pass it through the self._model.attention_source_layer.
                    #   Because self._source_encoded does not change, you should in fact do it in `initialize`.
                    # - Pass `states` through self._model.attention_state_layer.
                    # - Sum the two outputs. However, the first has shape [a, b, c] and the second [a, c]. Therefore,
                    #   somehow expand the second to [a, b, c] first. (Hint: use broadcasting rules.)
                    # - Pass the sum through `tf.tanh`, then self._model.attention_weight_layer.
                    # - Then, run softmax on a suitable axis (the one corresponding to characters), generating `weights`.
                    # - Multiply `self._source_encoded` with `weights` and sum the result in the axis
                    #   corresponding to characters, generating `attention`. Therefore, `attention` is a fixed-size
                    #   representation for every batch element, independent of how many characters the
                    #   corresponding input forms had.
                    # - Finally concatenate `inputs` and `attention` and return the result.
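                    # One possible implementation of the steps above (a sketch; intermediate variable names
                    # other than `weights` and `attention` are illustrative, and for efficiency the projection
                    # of self._source_encoded could instead be precomputed in `initialize`).
                    projected_source = self._model.attention_source_layer(self._source_encoded)
                    projected_state = self._model.attention_state_layer(states)
                    scores = self._model.attention_weight_layer(
                        tf.tanh(projected_source + tf.expand_dims(projected_state, axis=1)))
                    weights = tf.nn.softmax(scores, axis=1)
                    attention = tf.reduce_sum(self._source_encoded * weights, axis=1)
                    return tf.concat([inputs, attention], axis=1)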

                def initialize(self, layer_inputs, initial_state=None):
                    self._model, self._source_encoded, self._targets = layer_inputs

                    # TODO(lemmatizer_noattn): Define `finished` as a vector of self.batch_size of `False` [see tf.fill].
                    # TODO(lemmatizer_noattn): Define `inputs` as a vector of self.batch_size of MorphoDataset.Factor.BOW [see tf.fill],
                    #   embedded using self._model.target_embedding
                    # TODO: Define `states` as the last words from self._source_encoded
                    # TODO: Pass `inputs` through `self._with_attention(inputs, states)`.
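                    # One possible implementation of the TODOs above (a sketch; it assumes the last
                    # timestep of self._source_encoded is a usable summary of the encoded word).
                    finished = tf.fill([self.batch_size], False)
                    inputs = self._model.target_embedding(tf.fill([self.batch_size], MorphoDataset.Factor.BOW))
                    states = self._source_encoded[:, -1]
                    inputs = self._with_attention(inputs, states)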
                    return finished, inputs, states

                def step(self, time, inputs, states):
                    # TODO(lemmatizer_noattn): Pass `inputs` and `[states]` through self._model.target_rnn_cell, generating
                    #   `outputs, [states]`.
                    # TODO(lemmatizer_noattn): Overwrite `outputs` by passing them through self._model.target_output_layer.
                    # TODO(lemmatizer_noattn): Define `next_inputs` by embedding `time`-th words from `self._targets`.
                    # TODO(lemmatizer_noattn): Define `finished` as True if `time`-th word from `self._targets` is EOW, False otherwise.
                    #   Again, no == or !=.
                    # TODO: Pass `next_inputs` through `self._with_attention(next_inputs, states)`.
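                    # One possible implementation of the TODOs above (a sketch).
                    outputs, [states] = self._model.target_rnn_cell(inputs, [states])
                    outputs = self._model.target_output_layer(outputs)
                    next_inputs = self._model.target_embedding(self._targets[:, time])
                    finished = tf.equal(self._targets[:, time], MorphoDataset.Factor.EOW)
                    next_inputs = self._with_attention(next_inputs, states)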
                    return outputs, states, next_inputs, finished

            output_layer, _, _ = DecoderTraining()([self._model, source_encoded, targets])
            # TODO(lemmatizer_noattn): Compute loss. Use only nonzero `targets` as a mask.
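            # One possible loss (a sketch): weight each position by whether its target is nonzero.
            loss = self._loss(targets, output_layer, tf.cast(tf.not_equal(targets, 0), tf.float32))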
        gradients = tape.gradient(loss, self._model.variables)
        self._optimizer.apply_gradients(zip(gradients, self._model.variables))

        tf.summary.experimental.set_step(self._optimizer.iterations)
        with self._writer.as_default():
            for name, metric in self._metrics_training.items():
                metric.reset_states()
                if name == "loss": metric(loss)
                else: metric(targets, output_layer, tf.not_equal(targets, 0))
                tf.summary.scalar("train/{}".format(name), metric.result())

        return tf.math.argmax(output_layer, axis=2)

    def train_epoch(self, dataset, args):
        for batch in dataset.batches(args.batch_size):
            # TODO(lemmatizer_noattn): Call train_batch, storing results in `predictions`.
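            # One possible call (a sketch), mirroring the argument order used by evaluate_batch below.
            predictions = self.train_batch(
                batch[dataset.FORMS].charseq_ids, batch[dataset.FORMS].charseqs,
                batch[dataset.LEMMAS].charseq_ids, batch[dataset.LEMMAS].charseqs)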

            form, gold_lemma, system_lemma = "", "", ""
            for i in batch[dataset.FORMS].charseqs[1]:
                if i: form += dataset.data[dataset.FORMS].alphabet[i]
            for i in range(len(batch[dataset.LEMMAS].charseqs[1])):
                if batch[dataset.LEMMAS].charseqs[1][i]:
                    gold_lemma += dataset.data[dataset.LEMMAS].alphabet[batch[dataset.LEMMAS].charseqs[1][i]]
                    system_lemma += dataset.data[dataset.LEMMAS].alphabet[predictions[0][i]]
            print(float(self._metrics_training["accuracy"].result()), form, gold_lemma, system_lemma)

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 2, autograph=False)
    def predict_batch(self, source_charseq_ids, source_charseqs):
        # TODO(lemmatizer_noattn)(train_batch): Embed source charseqs
        # TODO(train_batch): Run self._model.source_rnn on the embedded sequences, returning outputs in `source_encoded`.
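        # One possible implementation of the two TODOs above (a sketch, mirroring train_batch).
        source_embedded = self._model.source_embeddings(source_charseqs)
        source_encoded = self._model.source_rnn(source_embedded)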

        # Copy the source_encoded to corresponding batch places, and then flatten it
        source_mask = tf.not_equal(source_charseq_ids, 0)
        source_encoded = tf.boolean_mask(tf.gather(source_encoded, source_charseq_ids), source_mask)

        class DecoderPrediction(decoder.BaseDecoder):
            # The properties below follow the TODO hints; the expressions are one possible reading of them.
            @property
            def batch_size(self): return tf.shape(self._source_encoded)[0]  # TODO(lemmatizer_noattn)(train_batch): Return the batch size of self._source_encoded, using tf.shape
            @property
            def output_size(self): return 1  # TODO(lemmatizer_noattn): Return 1 because we are returning directly the predictions
            @property
            def output_dtype(self): return tf.int32  # TODO(lemmatizer_noattn): Return tf.int32 because the predictions are integral

            def _with_attention(self, inputs, states):
                # TODO: A copy of _with_attention from train_batch; you can of course
                #   move the definition to a place where it can be reused in both places.
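                # The same attention sketch as in train_batch, repeated here so the class is self-contained.
                projected_source = self._model.attention_source_layer(self._source_encoded)
                projected_state = self._model.attention_state_layer(states)
                scores = self._model.attention_weight_layer(
                    tf.tanh(projected_source + tf.expand_dims(projected_state, axis=1)))
                weights = tf.nn.softmax(scores, axis=1)
                attention = tf.reduce_sum(self._source_encoded * weights, axis=1)
                return tf.concat([inputs, attention], axis=1)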

            def initialize(self, layer_inputs, initial_state=None):
                self._model, self._source_encoded = layer_inputs

                # TODO(lemmatizer_noattn)(train_batch): Define `finished` as a vector of self.batch_size of `False` [see tf.fill].
                # TODO(lemmatizer_noattn)(train_batch): Define `inputs` as a vector of self.batch_size of MorphoDataset.Factor.BOW [see tf.fill],
                #   embedded using self._model.target_embedding
                # TODO(train_batch): Define `states` as the last words from self._source_encoded
                # TODO(train_batch): Pass `inputs` through `self._with_attention(inputs, states)`.
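                # One possible implementation (a sketch), analogous to DecoderTraining.initialize.
                finished = tf.fill([self.batch_size], False)
                inputs = self._model.target_embedding(tf.fill([self.batch_size], MorphoDataset.Factor.BOW))
                states = self._source_encoded[:, -1]
                inputs = self._with_attention(inputs, states)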
                return finished, inputs, states

            def step(self, time, inputs, states):
                # TODO(lemmatizer_noattn)(train_batch): Pass `inputs` and `[states]` through self._model.target_rnn_cell, generating
                #   `outputs, [states]`.
                # TODO(lemmatizer_noattn)(train_batch): Overwrite `outputs` by passing them through self._model.target_output_layer.
                # TODO(lemmatizer_noattn): Overwrite `outputs` by passing them through `tf.argmax` on a suitable axis and with
                #   the `output_type=tf.int32` parameter.
                # TODO(lemmatizer_noattn): Define `next_inputs` by embedding the `outputs`.
                # TODO(lemmatizer_noattn): Define `finished` as True if `outputs` are EOW, False otherwise. [No == or !=].
                # TODO(train_batch): Pass `next_inputs` through `self._with_attention(next_inputs, states)`.
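                # One possible implementation of the TODOs above (a sketch).
                outputs, [states] = self._model.target_rnn_cell(inputs, [states])
                outputs = self._model.target_output_layer(outputs)
                outputs = tf.argmax(outputs, axis=-1, output_type=tf.int32)
                next_inputs = self._model.target_embedding(outputs)
                finished = tf.equal(outputs, MorphoDataset.Factor.EOW)
                next_inputs = self._with_attention(next_inputs, states)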
                return outputs, states, next_inputs, finished

        predictions, _, _ = DecoderPrediction(maximum_iterations=tf.shape(source_charseqs)[1] + 10)([self._model, source_encoded])
        return predictions

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)] * 4, autograph=False)
    def evaluate_batch(self, source_charseq_ids, source_charseqs, target_charseq_ids, target_charseqs):
        # Predict
        predictions = self.predict_batch(source_charseq_ids, source_charseqs)

        # Append EOW to target_charseqs and copy them to corresponding places and flatten it
        target_charseqs = self._append_eow(target_charseqs)
        targets = tf.boolean_mask(tf.gather(target_charseqs, target_charseq_ids), tf.not_equal(source_charseq_ids, 0))

        # Compute accuracy, but on the whole sequences
        mask = tf.cast(tf.not_equal(targets, 0), tf.int32)
        resized_predictions = tf.concat([predictions, tf.zeros_like(targets)], axis=1)[:, :tf.shape(targets)[1]]
        equals = tf.reduce_all(tf.equal(resized_predictions * mask, targets * mask), axis=1)
        self._metrics_evaluation["accuracy"](equals)

    def evaluate(self, dataset, dataset_name, args):
        for metric in self._metrics_evaluation.values():
            metric.reset_states()
        for batch in dataset.batches(args.batch_size):
            predictions = self.evaluate_batch(batch[dataset.FORMS].charseq_ids, batch[dataset.FORMS].charseqs,
                                              batch[dataset.LEMMAS].charseq_ids, batch[dataset.LEMMAS].charseqs)

        metrics = {name: float(metric.result()) for name, metric in self._metrics_evaluation.items()}
        with self._writer.as_default():
            for name, value in metrics.items():
                tf.summary.scalar("{}/{}".format(dataset_name, name), value)

        return metrics


if __name__ == "__main__":
    import argparse
    import datetime
    import os
    import re

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=10, type=int, help="Batch size.")
    parser.add_argument("--cle_dim", default=64, type=int, help="CLE embedding dimension.")
    parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
    parser.add_argument("--max_sentences", default=5000, type=int, help="Maximum number of sentences to load.")
    parser.add_argument("--recodex", default=False, action="store_true", help="Evaluation in ReCodEx.")
    parser.add_argument("--rnn_dim", default=64, type=int, help="RNN cell dimension.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    if args.recodex:
        tf.keras.utils.get_custom_objects()["glorot_uniform"] = lambda: tf.initializers.glorot_uniform(seed=42)
        tf.keras.utils.get_custom_objects()["orthogonal"] = lambda: tf.initializers.orthogonal(seed=42)
        tf.keras.utils.get_custom_objects()["uniform"] = lambda: tf.initializers.RandomUniform(seed=42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create logdir name
    args.logdir = os.path.join("logs", "{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value) for key, value in sorted(vars(args).items())))
    ))

    # Load the data
    morpho = MorphoDataset("czech_cac", max_sentences=args.max_sentences)

    # Create the network and train
    network = Network(args,
                      num_source_chars=len(morpho.train.data[morpho.train.FORMS].alphabet),
                      num_target_chars=len(morpho.train.data[morpho.train.LEMMAS].alphabet))
    for epoch in range(args.epochs):
        network.train_epoch(morpho.train, args)
        metrics = network.evaluate(morpho.dev, "dev", args)
        print("Evaluation on {}, epoch {}: {}".format("dev", epoch + 1, metrics))

    metrics = network.evaluate(morpho.test, "test", args)
    with open("lemmatizer.out", "w") as out_file:
        print("{:.2f}".format(100 * metrics["accuracy"]), file=out_file)