error.log

"notebooks/pipeline/train.py", line 35, in <module>
[3]<stderr>:    Trainer(**config['train']['trainer']).run()
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
[3]<stderr>:    raise e
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
[3]<stderr>:    function_return_value = f(*args, **kwargs)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
[3]<stderr>:    final_model_dir, train_summary = runner.train(
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
[3]<stderr>:    devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
[3]<stderr>:    devices = tf.config.list_logical_devices(device_type=device_type)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
[3]<stderr>:    return context.context().list_logical_devices(device_type=device_type)
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
[3]<stderr>:    self.ensure_initialized()
[3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py""notebooks/pipeline/train.py", line 35, in <module>
[2]<stderr>:    Trainer(**config['train']['trainer']).run()
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
[2]<stderr>:    raise e
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
[2]<stderr>:    function_return_value = f(*args, **kwargs)
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
[2]<stderr>:    final_model_dir, train_summary = runner.train(
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
[2]<stderr>:    devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
[2]<stderr>:    devices = tf.config.list_logical_devices(device_type=device_type)
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
[2]<stderr>:    return context.context().list_logical_devices(device_type=device_type)
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
[2]<stderr>:    self.ensure_initialized()
[2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py""notebooks/pipeline/train.py", line 35, in <module>
[0]<stderr>:          )
[1]<stderr>:    Trainer(**config['train']['trainer']).run()
[0]<stderr>:          (input_layer_norm): LayerNorm()
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
[0]<stderr>:        )
[1]<stderr>:    raise e
[0]<stderr>:      )
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
[0]<stderr>:      (5): SelfAttentionDecoderLayer(
[1]<stderr>:    function_return_value = f(*args, **kwargs)
[0]<stderr>:        (self_attention): TransformerLayerWrapper(
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
[0]<stderr>:          (layer): MultiHeadAttention(
[1]<stderr>:    final_model_dir, train_summary = runner.train(
[0]<stderr>:            (linear_queries): Dense(1024)
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
[0]<stderr>:            (linear_keys): Dense(1024)
[1]<stderr>:    devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
[0]<stderr>:            (linear_values): Dense(1024)
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
[0]<stderr>:            (linear_output): Dense(1024)
[1]<stderr>:    devices = tf.config.list_logical_devices(device_type=device_type)
[0]<stderr>:          )
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
[0]<stderr>:          (input_layer_norm): LayerNorm()
[1]<stderr>:    return context.context().list_logical_devices(device_type=device_type)
[0]<stderr>:        )
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
[0]<stderr>:        (attention): ListWrapper(
[1]<stderr>:    self.ensure_initialized()
[0]<stderr>:          (0): TransformerLayerWrapper(
[1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 525, in ensure_initialized
[0]<stderr>:            (layer): MultiHeadAttention(
[1]<stderr>:    context_handle = pywrap_tfe.TFE_NewContext(opts)
[0]<stderr>:              (linear_queries): Dense(1024)
[1]<stderr>:tensorflow.python.framework.errors_impl.AlreadyExistsError: TensorFlow device (GPU:0) is being mapped to multiple devices (1 now, and 0 previously), which is not supported. This may be the result of providing different GPU configurations (ConfigProto.gpu_options, for example different visible_device_list) when creating multiple Sessions in the same process. This is not currently supported, see https://github.com/tensorflow/tensorflow/issues/19083
[0]<stderr>:              (linear_keys): Dense(1024)
[0]<stderr>:              (linear_values): Dense(1024)
[0]<stderr>:              (linear_output): Dense(1024)
[0]<stderr>:            )
[0]<stderr>:            (input_layer_norm): LayerNorm()
[0]<stderr>:          )
[0]<stderr>:        )
[0]<stderr>:        (ffn): TransformerLayerWrapper(
[0]<stderr>:          (layer): FeedForwardNetwork(
[0]<stderr>:            (inner): Dense(4096)
[0]<stderr>:            (outer): Dense(1024)
[0]<stderr>:          )
[0]<stderr>:          (input_layer_norm): LayerNorm()
[0]<stderr>:        )
[0]<stderr>:      )
[0]<stderr>:    )
[0]<stderr>:  )
[0]<stderr>:)
[0]<stderr>:
[0]<stderr>:2021-11-24 11:35:54.862523: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties:
[0]<stderr>:pciBusID: 0000:8e:00.0 name: Quadro RTX 6000 computeCapability: 7.5
[0]<stderr>:coreClock: 1.77GHz coreCount: 72 deviceMemorySize: 23.65GiB deviceMemoryBandwidth: 625.94GiB/s
[0]<stderr>:2021-11-24 11:35:54.865344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
[0]<stderr>:2021-11-24 11:35:54.865375: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
[0]<stderr>:2021-11-24 11:35:54.865379: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0
[0]<stderr>:2021-11-24 11:35:54.865383: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N
[0]<stderr>:2021-11-24 11:35:54.870941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 21803 MB memory) -> physical GPU (device: 0, name: Quadro RTX 6000, pci bus id: 0000:8e:00.0, compute capability: 7.5)
[0]<stderr>:INFO:tensorflow:Using parameters:
[0]<stderr>:data:
[0]<stderr>:  eval_features_file: gen_enfr/data_src.valid.tok
[0]<stderr>:  eval_labels_file: gen_enfr/data_tgt.valid.tok
[0]<stderr>:  source_vocabulary: gen_enfr/bpe_src.vocab
[0]<stderr>:  target_vocabulary: gen_enfr/bpe_tgt.vocab
[0]<stderr>:  train_features_file: gen_enfr/data_src.train.tok
[0]<stderr>:  train_labels_file: gen_enfr/data_tgt.train.tok
[0]<stderr>:eval:
[0]<stderr>:  batch_size: 32
[0]<stderr>:  batch_type: examples
[0]<stderr>:  early_stopping:
[0]<stderr>:    metric: bleu
[0]<stderr>:    min_improvement: 0.01
[0]<stderr>:    steps: 3
[0]<stderr>:  external_evaluators: BLEU
[0]<stderr>:  length_bucket_width: 5
[0]<stderr>:  save_eval_predictions: false
[0]<stderr>:  steps: 5000
[0]<stderr>:infer:
[0]<stderr>:  batch_size: 32
[0]<stderr>:  batch_type: examples
[0]<stderr>:  bucket_width: 5
[0]<stderr>:  length_bucket_width: 5
[0]<stderr>:model_dir: gen_enfr/run
[0]<stderr>:params:
[0]<stderr>:  average_loss_in_time: true
[0]<stderr>:  beam_width: 2
[0]<stderr>:  contrastive_learning: false
[0]<stderr>:  coverage_penalty: 0
[0]<stderr>:  decay_params:
[0]<stderr>:    model_dim: 1024
[0]<stderr>:    warmup_steps: 8000
[0]<stderr>:  decay_type: NoamDecay
[0]<stderr>:  decoding_subword_token: "\uFFED"
[0]<stderr>:  decoding_subword_token_is_spacer: false
[0]<stderr>:  label_smoothing: 0.1
[0]<stderr>:  learning_rate: 1.0
[0]<stderr>:  length_penalty: 0.6
[0]<stderr>:  max_margin_eta: 0.1
[0]<stderr>:  maximum_decoding_length: 256
[0]<stderr>:  num_hypotheses: 1
[0]<stderr>:  optimizer: Adam
[0]<stderr>:  optimizer_params:
[0]<stderr>:    beta_1: 0.9
[0]<stderr>:    beta_2: 0.998
[0]<stderr>:score:
[0]<stderr>:  batch_size: 64
[0]<stderr>:  batch_type: examples
[0]<stderr>:  length_bucket_width: 5
[0]<stderr>:train:
[0]<stderr>:  average_last_checkpoints: 8
[0]<stderr>:  batch_size: 4096
[0]<stderr>:  batch_type: tokens
[0]<stderr>:  effective_batch_size: 25000
[0]<stderr>:  keep_checkpoint_max: 8
[0]<stderr>:  length_bucket_width: 1
[0]<stderr>:  max_step: 200000
[0]<stderr>:  maximum_features_length: 256
[0]<stderr>:  maximum_labels_length: 256
[0]<stderr>:  moving_average_decay: 0.9999
[0]<stderr>:  replace_unknown_target: true
[0]<stderr>:  sample_buffer_size: 500000
[0]<stderr>:  save_checkpoints_steps: 1000
[0]<stderr>:  save_summary_steps: 200
[0]<stderr>:  single_pass: false
[0]<stderr>:
[0]<stderr>:INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
[0]<stderr>:Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0
[0]<stderr>:INFO:tensorflow:Initialized source input layer:
[0]<stderr>:INFO:tensorflow: - vocabulary size: 34608
[0]<stderr>:INFO:tensorflow: - special tokens: BOS=no, EOS=no
[0]<stderr>:INFO:tensorflow:Initialized target input layer:
[0]<stderr>:INFO:tensorflow: - vocabulary size: 42992
[0]<stderr>:INFO:tensorflow: - special tokens: BOS=yes, EOS=yes
[0]<stderr>:INFO:tensorflow:Restored checkpoint gen_enfr/run/ckpt-1
[0]<stderr>:WARNING:tensorflow:From /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/summary/summary_iterator.py:31: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.
[0]<stderr>:Instructions for updating:
[0]<stderr>:Use eager execution and:
[0]<stderr>:`tf.data.TFRecordDataset(path)`
[0]<stderr>:Terminated
Traceback (most recent call last):
  File "/opt/mt/miniconda3/envs/horovod/bin/horovodrun", line 8, in <module>
    sys.exit(run_commandline())
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 770, in run_commandline
    _run(args)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 760, in _run
    return _run_static(args)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 617, in _run_static
    _launch_job(args, settings, nics, command)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 730, in _launch_job
    run_controller(args.use_gloo, gloo_run_fn,
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 706, in run_controller
    gloo_run()
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 722, in gloo_run_fn
    gloo_run(settings, nics, env, driver_ip, command)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 298, in gloo_run
    launch_gloo(command, exec_command, settings, nics, env, server_ip)
  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 282, in launch_gloo
    raise RuntimeError('Horovod detected that one or more processes exited with non-zero '
RuntimeError: Horovod detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was:
Process name: 2
Exit code: 1