error.log

"\uFFED""\uFFED"
[1]<stderr>:  eval_features_file: gen_enfr/data_src.valid.tok
[2]<stderr>:  decoding_subword_token_is_spacer: false
[1]<stderr>:  eval_labels_file: gen_enfr/data_tgt.valid.tok
[2]<stderr>:  label_smoothing: 0.1
[1]<stderr>:  source_vocabulary: gen_enfr/bpe_src.vocab
[2]<stderr>:  learning_rate: 1.0
[1]<stderr>:  target_vocabulary: gen_enfr/bpe_tgt.vocab
[2]<stderr>:  length_penalty: 0.6
[1]<stderr>:  train_features_file: gen_enfr/data_src.train.tok
[2]<stderr>:  max_margin_eta: 0.1
[1]<stderr>:  train_labels_file: gen_enfr/data_tgt.train.tok
[2]<stderr>:  maximum_decoding_length: 256
[1]<stderr>:eval:
[2]<stderr>:  num_hypotheses: 1
[1]<stderr>:  batch_size: 32
[2]<stderr>:  optimizer: Adam
[1]<stderr>:  batch_type: examples
[2]<stderr>:  optimizer_params:
[1]<stderr>:  early_stopping:
[2]<stderr>:    beta_1: 0.9
[1]<stderr>:    metric: bleu
[2]<stderr>:    beta_2: 0.998
[1]<stderr>:    min_improvement: 0.01
[2]<stderr>:score:
[1]<stderr>:    steps: 3
[2]<stderr>:  batch_size: 64
[1]<stderr>:  external_evaluators: BLEU
[2]<stderr>:  batch_type: examples
[1]<stderr>:  length_bucket_width: 5
[2]<stderr>:  length_bucket_width: 5
[1]<stderr>:  save_eval_predictions: false
[2]<stderr>:train:
[1]<stderr>:  steps: 5000
[2]<stderr>:  average_last_checkpoints: 8
[1]<stderr>:infer:
[2]<stderr>:  batch_size: 4096
[1]<stderr>:  batch_size: 32
[2]<stderr>:  batch_type: tokens
[1]<stderr>:  batch_type: examples
[2]<stderr>:  effective_batch_size: 25000
[1]<stderr>:  bucket_width: 5
[2]<stderr>:  keep_checkpoint_max: 8
[1]<stderr>:  length_bucket_width: 5
[2]<stderr>:  length_bucket_width: 1
[1]<stderr>:model_dir: gen_enfr/run
[2]<stderr>:  max_step: 200000
[1]<stderr>:params:
[2]<stderr>:  maximum_features_length: 256
[1]<stderr>:  average_loss_in_time: true
[2]<stderr>:  maximum_labels_length: 256
[1]<stderr>:  beam_width: 2
[2]<stderr>:  moving_average_decay: 0.9999
[1]<stderr>:  contrastive_learning: false
[2]<stderr>:  replace_unknown_target: true
[1]<stderr>:  coverage_penalty: 0
[2]<stderr>:  sample_buffer_size: 500000
[1]<stderr>:  decay_params:
[2]<stderr>:  save_checkpoints_steps: 1000
[1]<stderr>:    model_dim: 1024
[2]<stderr>:  save_summary_steps: 200
[1]<stderr>:    warmup_steps: 8000
[2]<stderr>:  single_pass: false
[1]<stderr>:  decay_type: NoamDecay
[2]<stderr>:
[1]<stderr>:  decoding_subword_token: "\uFFED"
[1]<stderr>:  decoding_subword_token_is_spacer: false
[1]<stderr>:  label_smoothing: 0.1
[1]<stderr>:  learning_rate: 1.0
[1]<stderr>:  length_penalty: 0.6
[1]<stderr>:  max_margin_eta: 0.1
[1]<stderr>:  maximum_decoding_length: 256
[1]<stderr>:  num_hypotheses: 1
[1]<stderr>:  optimizer: Adam
[1]<stderr>:  optimizer_params:
[1]<stderr>:    beta_1: 0.9
[1]<stderr>:    beta_2: 0.998
[1]<stderr>:score:
[1]<stderr>:  batch_size: 64
[1]<stderr>:  batch_type: examples
[1]<stderr>:  length_bucket_width: 5
[1]<stderr>:train:
[1]<stderr>:  average_last_checkpoints: 8
[1]<stderr>:  batch_size: 4096
[1]<stderr>:  batch_type: tokens
[1]<stderr>:  effective_batch_size: 25000
[1]<stderr>:  keep_checkpoint_max: 8
[1]<stderr>:  length_bucket_width: 1
[1]<stderr>:  max_step: 200000
[1]<stderr>:  maximum_features_length: 256
[1]<stderr>:  maximum_labels_length: 256
[1]<stderr>:  moving_average_decay: 0.9999
[1]<stderr>:  replace_unknown_target: true
[1]<stderr>:  sample_buffer_size: 500000
[1]<stderr>:  save_checkpoints_steps: 1000
[1]<stderr>:  save_summary_steps: 200
[1]<stderr>:  single_pass: false
[1]<stderr>:
[2]<stderr>:INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
[2]<stderr>:Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0
[1]<stderr>:INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
[1]<stderr>:Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0
[3]<stderr>:2021-11-24 09:32:41.501168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0, 1, 2, 3
[3]<stderr>:2021-11-24 09:32:41.501227: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
[3]<stderr>:2021-11-24 09:32:41.501238: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0 1 2 3
[3]<stderr>:2021-11-24 09:32:41.501244: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N Y Y Y
[3]<stderr>:2021-11-24 09:32:41.501248: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 1:   Y N Y Y
[3]<stderr>:2021-11-24 09:32:41.501252: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 2:   Y Y N Y
[3]<stderr>:2021-11-24 09:32:41.501255: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 3:   Y Y Y N
[3]<stderr>:2021-11-24 09:32:41.502881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 21913 MB memory) -> physical GPU (device: 0, name: Quadro RTX 6000, pci bus id: 0000:8e:00.0, compute capability: 7.5)
[3]<stderr>:2021-11-24 09:32:41.503260: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 21913 MB memory) -> physical GPU (device: 1, name: Quadro RTX 6000, pci bus id: 0000:9c:00.0, compute capability: 7.5)
[3]<stderr>:2021-11-24 09:32:41.503595: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 21913 MB memory) -> physical GPU (device: 2, name: Quadro RTX 6000, pci bus id: 0000:ce:00.0, compute capability: 7.5)
[3]<stderr>:2021-11-24 09:32:41.503924: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 21913 MB memory) -> physical GPU (device: 3, name: Quadro RTX 6000, pci bus id: 0000:dc:00.0, compute capability: 7.5)
[3]<stderr>:INFO:tensorflow:Using parameters:
[3]<stderr>:data:
[3]<stderr>:  eval_features_file: gen_enfr/data_src.valid.tok
[3]<stderr>:  eval_labels_file: gen_enfr/data_tgt.valid.tok
[3]<stderr>:  source_vocabulary: gen_enfr/bpe_src.vocab
[3]<stderr>:  target_vocabulary: gen_enfr/bpe_tgt.vocab
[3]<stderr>:  train_features_file: gen_enfr/data_src.train.tok
[3]<stderr>:  train_labels_file: gen_enfr/data_tgt.train.tok
[3]<stderr>:eval:
[3]<stderr>:  batch_size: 32
[3]<stderr>:  batch_type: examples
[3]<stderr>:  early_stopping:
[3]<stderr>:    metric: bleu
[3]<stderr>:    min_improvement: 0.01
[3]<stderr>:    steps: 3
[3]<stderr>:  external_evaluators: BLEU
[3]<stderr>:  length_bucket_width: 5
[3]<stderr>:  save_eval_predictions: false
[3]<stderr>:  steps: 5000
[3]<stderr>:infer:
[3]<stderr>:  batch_size: 32
[3]<stderr>:  batch_type: examples
[3]<stderr>:  bucket_width: 5
[3]<stderr>:  length_bucket_width: 5
[3]<stderr>:model_dir: gen_enfr/run
[3]<stderr>:params:
[3]<stderr>:  average_loss_in_time: true
[3]<stderr>:  beam_width: 2
[3]<stderr>:  contrastive_learning: false
[3]<stderr>:  coverage_penalty: 0
[3]<stderr>:  decay_params:
[3]<stderr>:    model_dim: 1024
[3]<stderr>:    warmup_steps: 8000
[3]<stderr>:  decay_type: NoamDecay
[3]<stderr>:  decoding_subword_token: "\uFFED"'TF_GPU_ALLOCATOR=cuda_malloc_async'"notebooks/pipeline/train.py", line 22, in <module>
[3]<stderr>:    Trainer(**config['train']['trainer']).run()
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
[3]<stderr>:    raise e
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
[3]<stderr>:    function_return_value = f(*args, **kwargs)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
[3]<stderr>:    final_model_dir, train_summary = runner.train(
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 276, in train
[3]<stderr>:    summary = trainer(
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/training.py", line 121, in __call__
[3]<stderr>:    for i, loss in enumerate(
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/training.py", line 262, in _steps
[0]<stderr>:2021-11-24 09:33:01.803181: I tensorflow/stream_executor/cuda/cuda_driver.cc:789] failed to allocate 2.2K (2304 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
[3]<stderr>:    loss = forward_fn()
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 889, in __call__
[3]<stderr>:    result = self._call(*args, **kwds)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 933, in _call
[3]<stderr>:    self._initialize(args, kwds, add_initializers_to=initializers)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 763, in _initialize
[3]<stderr>:    self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 3050, in _get_concrete_function_internal_garbage_collected
[3]<stderr>:    graph_function, _ = self._maybe_define_function(args, kwargs)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 3444, in _maybe_define_function
[3]<stderr>:    graph_function = self._create_graph_function(args, kwargs)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 3279, in _create_graph_function
[3]<stderr>:    func_graph_module.func_graph_from_py_func(
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 999, in func_graph_from_py_func
[3]<stderr>:    func_outputs = python_func(*func_args, **func_kwargs)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 672, in wrapped_fn
[3]<stderr>:    out = weak_wrapped_fn().__wrapped__(*args, **kwds)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 986, in wrapper
[0]<stderr>:2021-11-24 09:33:01.805553: I tensorflow/stream_executor/cuda/cuda_driver.cc:789] failed to allocate 2.2K (2304 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
[3]<stderr>:    raise e.ag_error_metadata.to_exception(e)
[3]<stderr>:tensorflow.python.framework.errors_impl.InternalError: in user code:
[3]<stderr>:
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/training.py:247 _forward  *
[3]<stderr>:        target,
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/training.py:329 _forward  *
[3]<stderr>:        loss, gradients = self._compute_gradients(
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/training.py:311 _compute_gradients  *
[3]<stderr>:        reported_loss, gradients = self._model.compute_gradients(
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/models/model.py:223 _compute_loss  *
[3]<stderr>:        train_loss, report_loss = self.compute_training_loss(
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/models/model.py:263 compute_training_loss  *
[3]<stderr>:        outputs, _ = self(features, labels, training=True, step=step)
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/models/model.py:102 __call__  *
[3]<stderr>:        outputs, predictions = super().__call__(
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1023 __call__  **
[3]<stderr>:        self._maybe_build(inputs)
[3]<stderr>:    /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:2625 _maybe_build
[3]<stderr>:        self.build(input_shapes)  # pylint:disable=not-callable
"/opt/mt/miniconda3/envs/horovod/bin/horovodrun", line 8, in <module>
    sys.exit(run_commandline())
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 770, in run_commandline
    _run(args)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 760, in _run
    return _run_static(args)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 617, in _run_static
    _launch_job(args, settings, nics, command)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 730, in _launch_job
    run_controller(args.use_gloo, gloo_run_fn,
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 706, in run_controller
    gloo_run()
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 722, in gloo_run_fn
    gloo_run(settings, nics, env, driver_ip, command)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 298, in gloo_run
    launch_gloo(command, exec_command, settings, nics, env, server_ip)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 282, in launch_gloo
    raise RuntimeError('Horovod detected that one or more processes exited with non-zero '
RuntimeError: Horovod detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was:
Process name: 3
Exit code: 1