Advertisement
Guest User

error.log

a guest
Nov 24th, 2021
54
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. "notebooks/pipeline/train.py", line 35, in <module>
  2. [3]<stderr>:    Trainer(**config['train']['trainer']).run()
  3. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
  4. [3]<stderr>:    raise e
  5. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
  6. [3]<stderr>:    function_return_value = f(*args, **kwargs)
  7. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
  8. [3]<stderr>:    final_model_dir, train_summary = runner.train(
  9. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
  10. [3]<stderr>:    devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
  11. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
  12. [3]<stderr>:    devices = tf.config.list_logical_devices(device_type=device_type)
  13. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
  14. [3]<stderr>:    return context.context().list_logical_devices(device_type=device_type)
  15. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
  16. [3]<stderr>:    self.ensure_initialized()
  17. [3]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py""notebooks/pipeline/train.py", line 35, in <module>
  18. [2]<stderr>:    Trainer(**config['train']['trainer']).run()
  19. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
  20. [2]<stderr>:    raise e
  21. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
  22. [2]<stderr>:    function_return_value = f(*args, **kwargs)
  23. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
  24. [2]<stderr>:    final_model_dir, train_summary = runner.train(
  25. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
  26. [2]<stderr>:    devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
  27. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
  28. [2]<stderr>:    devices = tf.config.list_logical_devices(device_type=device_type)
  29. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
  30. [2]<stderr>:    return context.context().list_logical_devices(device_type=device_type)
  31. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
  32. [2]<stderr>:    self.ensure_initialized()
  33. [2]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py""notebooks/pipeline/train.py", line 35, in <module>
  34. [0]<stderr>:          )
  35. [1]<stderr>:    Trainer(**config['train']['trainer']).run()
  36. [0]<stderr>:          (input_layer_norm): LayerNorm()
  37. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 34, in wrapped_f
  38. [0]<stderr>:        )
  39. [1]<stderr>:    raise e
  40. [0]<stderr>:      )
  41. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/common/pipeline.py", line 30, in wrapped_f
  42. [0]<stderr>:      (5): SelfAttentionDecoderLayer(
  43. [1]<stderr>:    function_return_value = f(*args, **kwargs)
  44. [0]<stderr>:        (self_attention): TransformerLayerWrapper(
  45. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/cdtnice/train/train.py", line 48, in run
  46. [0]<stderr>:          (layer): MultiHeadAttention(
  47. [1]<stderr>:    final_model_dir, train_summary = runner.train(
  48. [0]<stderr>:            (linear_queries): Dense(1024)
  49. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/runner.py", line 199, in train
  50. [0]<stderr>:            (linear_keys): Dense(1024)
  51. [1]<stderr>:    devices = misc.get_devices(count=num_devices, fallback_to_cpu=fallback_to_cpu)
  52. [0]<stderr>:            (linear_values): Dense(1024)
  53. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/opennmt/utils/misc.py", line 33, in get_devices
  54. [0]<stderr>:            (linear_output): Dense(1024)
  55. [1]<stderr>:    devices = tf.config.list_logical_devices(device_type=device_type)
  56. [0]<stderr>:          )
  57. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/framework/config.py", line 452, in list_logical_devices
  58. [0]<stderr>:          (input_layer_norm): LayerNorm()
  59. [1]<stderr>:    return context.context().list_logical_devices(device_type=device_type)
  60. [0]<stderr>:        )
  61. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 1395, in list_logical_devices
  62. [0]<stderr>:        (attention): ListWrapper(
  63. [1]<stderr>:    self.ensure_initialized()
  64. [0]<stderr>:          (0): TransformerLayerWrapper(
  65. [1]<stderr>:  File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/eager/context.py", line 525, in ensure_initialized
  66. [0]<stderr>:            (layer): MultiHeadAttention(
  67. [1]<stderr>:    context_handle = pywrap_tfe.TFE_NewContext(opts)
  68. [0]<stderr>:              (linear_queries): Dense(1024)
  69. [1]<stderr>:tensorflow.python.framework.errors_impl.AlreadyExistsError: TensorFlow device (GPU:0) is being mapped to multiple devices (1 now, and 0 previously), which is not supported. This may be the result of providing different GPU configurations (ConfigProto.gpu_options, for example different visible_device_list) when creating multiple Sessions in the same process. This is not currently supported, see https://github.com/tensorflow/tensorflow/issues/19083
  70. [0]<stderr>:              (linear_keys): Dense(1024)
  71. [0]<stderr>:              (linear_values): Dense(1024)
  72. [0]<stderr>:              (linear_output): Dense(1024)
  73. [0]<stderr>:            )
  74. [0]<stderr>:            (input_layer_norm): LayerNorm()
  75. [0]<stderr>:          )
  76. [0]<stderr>:        )
  77. [0]<stderr>:        (ffn): TransformerLayerWrapper(
  78. [0]<stderr>:          (layer): FeedForwardNetwork(
  79. [0]<stderr>:            (inner): Dense(4096)
  80. [0]<stderr>:            (outer): Dense(1024)
  81. [0]<stderr>:          )
  82. [0]<stderr>:          (input_layer_norm): LayerNorm()
  83. [0]<stderr>:        )
  84. [0]<stderr>:      )
  85. [0]<stderr>:    )
  86. [0]<stderr>:  )
  87. [0]<stderr>:)
  88. [0]<stderr>:
  89. [0]<stderr>:2021-11-24 11:35:54.862523: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties:
  90. [0]<stderr>:pciBusID: 0000:8e:00.0 name: Quadro RTX 6000 computeCapability: 7.5
  91. [0]<stderr>:coreClock: 1.77GHz coreCount: 72 deviceMemorySize: 23.65GiB deviceMemoryBandwidth: 625.94GiB/s
  92. [0]<stderr>:2021-11-24 11:35:54.865344: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
  93. [0]<stderr>:2021-11-24 11:35:54.865375: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
  94. [0]<stderr>:2021-11-24 11:35:54.865379: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0
  95. [0]<stderr>:2021-11-24 11:35:54.865383: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N
  96. [0]<stderr>:2021-11-24 11:35:54.870941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 21803 MB memory) -> physical GPU (device: 0, name: Quadro RTX 6000, pci bus id: 0000:8e:00.0, compute capability: 7.5)
  97. [0]<stderr>:INFO:tensorflow:Using parameters:
  98. [0]<stderr>:data:
  99. [0]<stderr>:  eval_features_file: gen_enfr/data_src.valid.tok
  100. [0]<stderr>:  eval_labels_file: gen_enfr/data_tgt.valid.tok
  101. [0]<stderr>:  source_vocabulary: gen_enfr/bpe_src.vocab
  102. [0]<stderr>:  target_vocabulary: gen_enfr/bpe_tgt.vocab
  103. [0]<stderr>:  train_features_file: gen_enfr/data_src.train.tok
  104. [0]<stderr>:  train_labels_file: gen_enfr/data_tgt.train.tok
  105. [0]<stderr>:eval:
  106. [0]<stderr>:  batch_size: 32
  107. [0]<stderr>:  batch_type: examples
  108. [0]<stderr>:  early_stopping:
  109. [0]<stderr>:    metric: bleu
  110. [0]<stderr>:    min_improvement: 0.01
  111. [0]<stderr>:    steps: 3
  112. [0]<stderr>:  external_evaluators: BLEU
  113. [0]<stderr>:  length_bucket_width: 5
  114. [0]<stderr>:  save_eval_predictions: false
  115. [0]<stderr>:  steps: 5000
  116. [0]<stderr>:infer:
  117. [0]<stderr>:  batch_size: 32
  118. [0]<stderr>:  batch_type: examples
  119. [0]<stderr>:  bucket_width: 5
  120. [0]<stderr>:  length_bucket_width: 5
  121. [0]<stderr>:model_dir: gen_enfr/run
  122. [0]<stderr>:params:
  123. [0]<stderr>:  average_loss_in_time: true
  124. [0]<stderr>:  beam_width: 2
  125. [0]<stderr>:  contrastive_learning: false
  126. [0]<stderr>:  coverage_penalty: 0
  127. [0]<stderr>:  decay_params:
  128. [0]<stderr>:    model_dim: 1024
  129. [0]<stderr>:    warmup_steps: 8000
  130. [0]<stderr>:  decay_type: NoamDecay
  131. [0]<stderr>:  decoding_subword_token: "\uFFED"
  132. [0]<stderr>:  decoding_subword_token_is_spacer: false
  133. [0]<stderr>:  label_smoothing: 0.1
  134. [0]<stderr>:  learning_rate: 1.0
  135. [0]<stderr>:  length_penalty: 0.6
  136. [0]<stderr>:  max_margin_eta: 0.1
  137. [0]<stderr>:  maximum_decoding_length: 256
  138. [0]<stderr>:  num_hypotheses: 1
  139. [0]<stderr>:  optimizer: Adam
  140. [0]<stderr>:  optimizer_params:
  141. [0]<stderr>:    beta_1: 0.9
  142. [0]<stderr>:    beta_2: 0.998
  143. [0]<stderr>:score:
  144. [0]<stderr>:  batch_size: 64
  145. [0]<stderr>:  batch_type: examples
  146. [0]<stderr>:  length_bucket_width: 5
  147. [0]<stderr>:train:
  148. [0]<stderr>:  average_last_checkpoints: 8
  149. [0]<stderr>:  batch_size: 4096
  150. [0]<stderr>:  batch_type: tokens
  151. [0]<stderr>:  effective_batch_size: 25000
  152. [0]<stderr>:  keep_checkpoint_max: 8
  153. [0]<stderr>:  length_bucket_width: 1
  154. [0]<stderr>:  max_step: 200000
  155. [0]<stderr>:  maximum_features_length: 256
  156. [0]<stderr>:  maximum_labels_length: 256
  157. [0]<stderr>:  moving_average_decay: 0.9999
  158. [0]<stderr>:  replace_unknown_target: true
  159. [0]<stderr>:  sample_buffer_size: 500000
  160. [0]<stderr>:  save_checkpoints_steps: 1000
  161. [0]<stderr>:  save_summary_steps: 200
  162. [0]<stderr>:  single_pass: false
  163. [0]<stderr>:
  164. [0]<stderr>:INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
  165. [0]<stderr>:Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0
  166. [0]<stderr>:INFO:tensorflow:Initialized source input layer:
  167. [0]<stderr>:INFO:tensorflow: - vocabulary size: 34608
  168. [0]<stderr>:INFO:tensorflow: - special tokens: BOS=no, EOS=no
  169. [0]<stderr>:INFO:tensorflow:Initialized target input layer:
  170. [0]<stderr>:INFO:tensorflow: - vocabulary size: 42992
  171. [0]<stderr>:INFO:tensorflow: - special tokens: BOS=yes, EOS=yes
  172. [0]<stderr>:INFO:tensorflow:Restored checkpoint gen_enfr/run/ckpt-1
  173. [0]<stderr>:WARNING:tensorflow:From /opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/tensorflow/python/summary/summary_iterator.py:31: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.
  174. [0]<stderr>:Instructions for updating:
  175. [0]<stderr>:Use eager execution and:
  176. [0]<stderr>:`tf.data.TFRecordDataset(path)`
  177. [0]<stderr>:Terminated
  178. Traceback (most recent call last):
  179.   File "/opt/mt/miniconda3/envs/horovod/bin/horovodrun", line 8, in <module>
  180.     sys.exit(run_commandline())
  181.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 770, in run_commandline
  182.     _run(args)
  183.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 760, in _run
  184.     return _run_static(args)
  185.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 617, in _run_static
  186.     _launch_job(args, settings, nics, command)
  187.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 730, in _launch_job
  188.     run_controller(args.use_gloo, gloo_run_fn,
  189.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 706, in run_controller
  190.     gloo_run()
  191.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/launch.py", line 722, in gloo_run_fn
  192.     gloo_run(settings, nics, env, driver_ip, command)
  193.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 298, in gloo_run
  194.     launch_gloo(command, exec_command, settings, nics, env, server_ip)
  195.   File "/opt/mt/miniconda3/envs/horovod/lib/python3.8/site-packages/horovod/runner/gloo_run.py", line 282, in launch_gloo
  196.     raise RuntimeError('Horovod detected that one or more processes exited with non-zero '
  197. RuntimeError: Horovod detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was:
  198. Process name: 2
  199. Exit code: 1
  200.  
  201.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement