Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- "notebooks/pipeline/train.py", line 35, in <module>
- [3]<stderr>: Trainer(**config['train']['trainer']).run()
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
- [3]<stderr>: raise e
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
- [3]<stderr>: function_return_value = f(*args, **kwargs)
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
- [3]<stderr>: final_model_dir, train_summary = runner.train(
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
- [3]<stderr>: devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
- [3]<stderr>: devices = tf.config.list_logical_devices(device_type=device_type)
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
- [3]<stderr>: return context.context().list_logical_devices(device_type=device_type)
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
- [3]<stderr>: self.ensure_initialized()
- [3]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py""notebooks/pipeline/train.py", line 35, in <module>
- [2]<stderr>: Trainer(**config['train']['trainer']).run()
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
- [2]<stderr>: raise e
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
- [2]<stderr>: function_return_value = f(*args, **kwargs)
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
- [2]<stderr>: final_model_dir, train_summary = runner.train(
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
- [2]<stderr>: devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
- [2]<stderr>: devices = tf.config.list_logical_devices(device_type=device_type)
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
- [2]<stderr>: return context.context().list_logical_devices(device_type=device_type)
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
- [2]<stderr>: self.ensure_initialized()
- [2]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py""notebooks/pipeline/train.py", line 35, in <module>
- [0]<stderr>: )
- [1]<stderr>: Trainer(**config['train']['trainer']).run()
- [0]<stderr>: (input_layer_norm): LayerNorm()
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
- [0]<stderr>: )
- [1]<stderr>: raise e
- [0]<stderr>: )
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
- [0]<stderr>: (5): SelfAttentionDecoderLayer(
- [1]<stderr>: function_return_value = f(*args, **kwargs)
- [0]<stderr>: (self_attention): TransformerLayerWrapper(
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
- [0]<stderr>: (layer): MultiHeadAttention(
- [1]<stderr>: final_model_dir, train_summary = runner.train(
- [0]<stderr>: (linear_queries): Dense(1024)
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
- [0]<stderr>: (linear_keys): Dense(1024)
- [1]<stderr>: devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
- [0]<stderr>: (linear_values): Dense(1024)
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
- [0]<stderr>: (linear_output): Dense(1024)
- [1]<stderr>: devices = tf.config.list_logical_devices(device_type=device_type)
- [0]<stderr>: )
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
- [0]<stderr>: (input_layer_norm): LayerNorm()
- [1]<stderr>: return context.context().list_logical_devices(device_type=device_type)
- [0]<stderr>: )
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
- [0]<stderr>: (attention): ListWrapper(
- [1]<stderr>: self.ensure_initialized()
- [0]<stderr>: (0): TransformerLayerWrapper(
- [1]<stderr>: File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 525, in ensure_initialized
- [0]<stderr>: (layer): MultiHeadAttention(
- [1]<stderr>: context_handle = pywrap_tfe.TFE_NewContext(opts)
- [0]<stderr>: (linear_queries): Dense(1024)
- [1]<stderr>:tensorflow.python.framework.errors_impl.AlreadyExistsError: TensorFlow device (GPU:0) is being mapped to multiple devices (1 now, and 0 previously), which is not supported. This may be the result of providing different GPU configurations (ConfigProto.gpu_options, for example different visible_device_list) when creating multiple Sessions in the same process. This is not currently supported, see https://github.com/tensorflow/tensorflow/issues/19083
- [0]<stderr>: (linear_keys): Dense(1024)
- [0]<stderr>: (linear_values): Dense(1024)
- [0]<stderr>: (linear_output): Dense(1024)
- [0]<stderr>: )
- [0]<stderr>: (input_layer_norm): LayerNorm()
- [0]<stderr>: )
- [0]<stderr>: )
- [0]<stderr>: (ffn): TransformerLayerWrapper(
- [0]<stderr>: (layer): FeedForwardNetwork(
- [0]<stderr>: (inner): Dense(4096)
- [0]<stderr>: (outer): Dense(1024)
- [0]<stderr>: )
- [0]<stderr>: (input_layer_norm): LayerNorm()
- [0]<stderr>: )
- [0]<stderr>: )
- [0]<stderr>: )
- [0]<stderr>: )
- [0]<stderr>:)
- [0]<stderr>:
- [0]<stderr>:2021-11-24 11:35:54.862523: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties:
- [0]<stderr>:pciBusID: 0000:8e:00.0 name: Quadro RTX 6000 computeCapability: 7.5
- [0]<stderr>:coreClock: 1.77GHz coreCount: 72 deviceMemorySize: 23.65GiB deviceMemoryBandwidth: 625.94GiB/s
- [0]<stderr>:2021-11-24 11:35:54.865344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
- [0]<stderr>:2021-11-24 11:35:54.865375: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
- [0]<stderr>:2021-11-24 11:35:54.865379: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264] 0
- [0]<stderr>:2021-11-24 11:35:54.865383: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0: N
- [0]<stderr>:2021-11-24 11:35:54.870941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 21803 MB memory) -> physical GPU (device: 0, name: Quadro RTX 6000, pci bus id: 0000:8e:00.0, compute capability: 7.5)
- [0]<stderr>:INFO:tensorflow:Using parameters:
- [0]<stderr>:data:
- [0]<stderr>: eval_features_file: gen_enfr/data_src.valid.tok
- [0]<stderr>: eval_labels_file: gen_enfr/data_tgt.valid.tok
- [0]<stderr>: source_vocabulary: gen_enfr/bpe_src.vocab
- [0]<stderr>: target_vocabulary: gen_enfr/bpe_tgt.vocab
- [0]<stderr>: train_features_file: gen_enfr/data_src.train.tok
- [0]<stderr>: train_labels_file: gen_enfr/data_tgt.train.tok
- [0]<stderr>:eval:
- [0]<stderr>: batch_size: 32
- [0]<stderr>: batch_type: examples
- [0]<stderr>: early_stopping:
- [0]<stderr>: metric: bleu
- [0]<stderr>: min_improvement: 0.01
- [0]<stderr>: steps: 3
- [0]<stderr>: external_evaluators: BLEU
- [0]<stderr>: length_bucket_width: 5
- [0]<stderr>: save_eval_predictions: false
- [0]<stderr>: steps: 5000
- [0]<stderr>:infer:
- [0]<stderr>: batch_size: 32
- [0]<stderr>: batch_type: examples
- [0]<stderr>: bucket_width: 5
- [0]<stderr>: length_bucket_width: 5
- [0]<stderr>:model_dir: gen_enfr/run
- [0]<stderr>:params:
- [0]<stderr>: average_loss_in_time: true
- [0]<stderr>: beam_width: 2
- [0]<stderr>: contrastive_learning: false
- [0]<stderr>: coverage_penalty: 0
- [0]<stderr>: decay_params:
- [0]<stderr>: model_dim: 1024
- [0]<stderr>: warmup_steps: 8000
- [0]<stderr>: decay_type: NoamDecay
- [0]<stderr>: decoding_subword_token: "\uFFED"
- [0]<stderr>: decoding_subword_token_is_spacer: false
- [0]<stderr>: label_smoothing: 0.1
- [0]<stderr>: learning_rate: 1.0
- [0]<stderr>: length_penalty: 0.6
- [0]<stderr>: max_margin_eta: 0.1
- [0]<stderr>: maximum_decoding_length: 256
- [0]<stderr>: num_hypotheses: 1
- [0]<stderr>: optimizer: Adam
- [0]<stderr>: optimizer_params:
- [0]<stderr>: beta_1: 0.9
- [0]<stderr>: beta_2: 0.998
- [0]<stderr>:score:
- [0]<stderr>: batch_size: 64
- [0]<stderr>: batch_type: examples
- [0]<stderr>: length_bucket_width: 5
- [0]<stderr>:train:
- [0]<stderr>: average_last_checkpoints: 8
- [0]<stderr>: batch_size: 4096
- [0]<stderr>: batch_type: tokens
- [0]<stderr>: effective_batch_size: 25000
- [0]<stderr>: keep_checkpoint_max: 8
- [0]<stderr>: length_bucket_width: 1
- [0]<stderr>: max_step: 200000
- [0]<stderr>: maximum_features_length: 256
- [0]<stderr>: maximum_labels_length: 256
- [0]<stderr>: moving_average_decay: 0.9999
- [0]<stderr>: replace_unknown_target: true
- [0]<stderr>: sample_buffer_size: 500000
- [0]<stderr>: save_checkpoints_steps: 1000
- [0]<stderr>: save_summary_steps: 200
- [0]<stderr>: single_pass: false
- [0]<stderr>:
- [0]<stderr>:INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
- [0]<stderr>:Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0
- [0]<stderr>:INFO:tensorflow:Initialized source input layer:
- [0]<stderr>:INFO:tensorflow: - vocabulary size: 34608
- [0]<stderr>:INFO:tensorflow: - special tokens: BOS=no, EOS=no
- [0]<stderr>:INFO:tensorflow:Initialized target input layer:
- [0]<stderr>:INFO:tensorflow: - vocabulary size: 42992
- [0]<stderr>:INFO:tensorflow: - special tokens: BOS=yes, EOS=yes
- [0]<stderr>:INFO:tensorflow:Restored checkpoint gen_enfr/run/ckpt-1
- [0]<stderr>:WARNING:tensorflow:From /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/summary/summary_iterator.py:31: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.
- [0]<stderr>:Instructions for updating:
- [0]<stderr>:Use eager execution and:
- [0]<stderr>:`tf.data.TFRecordDataset(path)`
- [0]<stderr>:Terminated
- Traceback (most recent call last):
- File "/opt/mt/miniconda3/envs/horovod/bin/horovodrun", line 8, in <module>
- sys.exit(run_commandline())
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 770, in run_commandline
- _run(args)
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 760, in _run
- return _run_static(args)
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 617, in _run_static
- _launch_job(args, settings, nics, command)
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 730, in _launch_job
- run_controller(args.use_gloo, gloo_run_fn,
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 706, in run_controller
- gloo_run()
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 722, in gloo_run_fn
- gloo_run(settings, nics, env, driver_ip, command)
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 298, in gloo_run
- launch_gloo(command, exec_command, settings, nics, env, server_ip)
- File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 282, in launch_gloo
- raise RuntimeError('Horovod detected that one or more processes exited with non-zero '
- RuntimeError: Horovod detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was:
- Process name: 2
- Exit code: 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement