Advertisement
Guest User

Untitled

a guest
Oct 18th, 2019
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 24.69 KB | None | 0 0
  1. Creating tmp8z_8haic_algo-1-vmd10_1 ...
  2. Attaching to tmp8z_8haic_algo-1-vmd10_1 ... done
  3. algo-1-vmd10_1 | 2019-09-30 07:54:22,885 sagemaker-containers INFO Imported framework sagemaker_tensorflow_container.training
  4. algo-1-vmd10_1 | 2019-09-30 07:54:22,892 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
  5. algo-1-vmd10_1 | 2019-09-30 07:54:23,011 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
  6. algo-1-vmd10_1 | 2019-09-30 07:54:23,031 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
  7. algo-1-vmd10_1 | 2019-09-30 07:54:23,045 sagemaker-containers INFO Invoking user script
  8. algo-1-vmd10_1 |
  9. algo-1-vmd10_1 | Training Env:
  10. algo-1-vmd10_1 |
  11. algo-1-vmd10_1 | {
  12. algo-1-vmd10_1 | "additional_framework_parameters": {
  13. algo-1-vmd10_1 | "sagemaker_estimator": "RLEstimator"
  14. algo-1-vmd10_1 | },
  15. algo-1-vmd10_1 | "channel_input_dirs": {},
  16. algo-1-vmd10_1 | "current_host": "algo-1-vmd10",
  17. algo-1-vmd10_1 | "framework_module": "sagemaker_tensorflow_container.training:main",
  18. algo-1-vmd10_1 | "hosts": [
  19. algo-1-vmd10_1 | "algo-1-vmd10"
  20. algo-1-vmd10_1 | ],
  21. algo-1-vmd10_1 | "hyperparameters": {
  22. algo-1-vmd10_1 | "s3_bucket": "sagemaker-us-west-2-123456789012",
  23. algo-1-vmd10_1 | "rl.training.stop.training_iteration": 2,
  24. algo-1-vmd10_1 | "rl.training.checkpoint_freq": 2
  25. algo-1-vmd10_1 | },
  26. algo-1-vmd10_1 | "input_config_dir": "/opt/ml/input/config",
  27. algo-1-vmd10_1 | "input_data_config": {},
  28. algo-1-vmd10_1 | "input_dir": "/opt/ml/input",
  29. algo-1-vmd10_1 | "is_master": true,
  30. algo-1-vmd10_1 | "job_name": "ArrivalSim-2019-09-30-07-53-33-200",
  31. algo-1-vmd10_1 | "log_level": 20,
  32. algo-1-vmd10_1 | "master_hostname": "algo-1-vmd10",
  33. algo-1-vmd10_1 | "model_dir": "/opt/ml/model",
  34. algo-1-vmd10_1 | "module_dir": "s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz",
  35. algo-1-vmd10_1 | "module_name": "mod_op_train",
  36. algo-1-vmd10_1 | "network_interface_name": "eth0",
  37. algo-1-vmd10_1 | "num_cpus": 2,
  38. algo-1-vmd10_1 | "num_gpus": 0,
  39. algo-1-vmd10_1 | "output_data_dir": "/opt/ml/output/data",
  40. algo-1-vmd10_1 | "output_dir": "/opt/ml/output",
  41. algo-1-vmd10_1 | "output_intermediate_dir": "/opt/ml/output/intermediate",
  42. algo-1-vmd10_1 | "resource_config": {
  43. algo-1-vmd10_1 | "current_host": "algo-1-vmd10",
  44. algo-1-vmd10_1 | "hosts": [
  45. algo-1-vmd10_1 | "algo-1-vmd10"
  46. algo-1-vmd10_1 | ]
  47. algo-1-vmd10_1 | },
  48. algo-1-vmd10_1 | "user_entry_point": "mod_op_train.py"
  49. algo-1-vmd10_1 | }
  50. algo-1-vmd10_1 |
  51. algo-1-vmd10_1 | Environment variables:
  52. algo-1-vmd10_1 |
  53. algo-1-vmd10_1 | SM_HOSTS=["algo-1-vmd10"]
  54. algo-1-vmd10_1 | SM_NETWORK_INTERFACE_NAME=eth0
  55. algo-1-vmd10_1 | SM_HPS={"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"}
  56. algo-1-vmd10_1 | SM_USER_ENTRY_POINT=mod_op_train.py
  57. algo-1-vmd10_1 | SM_FRAMEWORK_PARAMS={"sagemaker_estimator":"RLEstimator"}
  58. algo-1-vmd10_1 | SM_RESOURCE_CONFIG={"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]}
  59. algo-1-vmd10_1 | SM_INPUT_DATA_CONFIG={}
  60. algo-1-vmd10_1 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data
  61. algo-1-vmd10_1 | SM_CHANNELS=[]
  62. algo-1-vmd10_1 | SM_CURRENT_HOST=algo-1-vmd10
  63. algo-1-vmd10_1 | SM_MODULE_NAME=mod_op_train
  64. algo-1-vmd10_1 | SM_LOG_LEVEL=20
  65. algo-1-vmd10_1 | SM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main
  66. algo-1-vmd10_1 | SM_INPUT_DIR=/opt/ml/input
  67. algo-1-vmd10_1 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config
  68. algo-1-vmd10_1 | SM_OUTPUT_DIR=/opt/ml/output
  69. algo-1-vmd10_1 | SM_NUM_CPUS=2
  70. algo-1-vmd10_1 | SM_NUM_GPUS=0
  71. algo-1-vmd10_1 | SM_MODEL_DIR=/opt/ml/model
  72. algo-1-vmd10_1 | SM_MODULE_DIR=s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz
  73. algo-1-vmd10_1 | SM_TRAINING_ENV={"additional_framework_parameters":{"sagemaker_estimator":"RLEstimator"},"channel_input_dirs":{},"current_host":"algo-1-vmd10","framework_module":"sagemaker_tensorflow_container.training:main","hosts":["algo-1-vmd10"],"hyperparameters":{"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"ArrivalSim-2019-09-30-07-53-33-200","log_level":20,"master_hostname":"algo-1-vmd10","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz","module_name":"mod_op_train","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]},"user_entry_point":"mod_op_train.py"}
  74. algo-1-vmd10_1 | SM_USER_ARGS=["--rl.training.checkpoint_freq","2","--rl.training.stop.training_iteration","2","--s3_bucket","sagemaker-us-west-2-123456789012"]
  75. algo-1-vmd10_1 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
  76. algo-1-vmd10_1 | SM_HP_S3_BUCKET=sagemaker-us-west-2-123456789012
  77. algo-1-vmd10_1 | SM_HP_RL.TRAINING.STOP.TRAINING_ITERATION=2
  78. algo-1-vmd10_1 | SM_HP_RL.TRAINING.CHECKPOINT_FREQ=2
  79. algo-1-vmd10_1 | PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python36.zip:/usr/lib/python3.6:/usr/lib/python3.6/lib-dynload:/usr/local/lib/python3.6/dist-packages:/usr/lib/python3/dist-packages
  80. algo-1-vmd10_1 |
  81. algo-1-vmd10_1 | Invoking script with the following command:
  82. algo-1-vmd10_1 |
  83. algo-1-vmd10_1 | /usr/bin/python mod_op_train.py --rl.training.checkpoint_freq 2 --rl.training.stop.training_iteration 2 --s3_bucket sagemaker-us-west-2-123456789012
  84. algo-1-vmd10_1 |
  85. algo-1-vmd10_1 |
  86. algo-1-vmd10_1 | {'monitor': False, 'log_level': 'INFO', 'callbacks': {'on_episode_start': None, 'on_episode_step': None, 'on_episode_end': None, 'on_sample_end': None, 'on_train_result': None}, 'ignore_worker_failures': False, 'model': {'conv_filters': None, 'conv_activation': 'relu', 'fcnet_activation': 'tanh', 'fcnet_hiddens': [256, 256], 'free_log_std': False, 'squash_to_range': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_preprocessor': None, 'custom_model': None, 'custom_options': {}}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'env_config': {}, 'env': None, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'num_workers': 2, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 'num_cpus_for_driver': 1, 'num_envs_per_worker': 1, 'sample_batch_size': 200, 'train_batch_size': 4000, 'batch_mode': 'truncate_episodes', 'sample_async': False, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_evaluator_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False, 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 'async_remote_worker_envs': False, 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policy_graphs': {}, 'policy_mapping_fn': None, 'policies_to_train': None}, 'use_gae': True, 'lambda': 1.0, 'kl_coeff': 0.2, 
'sgd_minibatch_size': 128, 'num_sgd_iter': 30, 'lr': 5e-05, 'lr_schedule': None, 'vf_share_layers': False, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'simple_optimizer': False, 'straggler_mitigation': False}
  87. algo-1-vmd10_1 | 2019-09-30 07:54:30,715 WARNING worker.py:1406 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.
  88. algo-1-vmd10_1 | 2019-09-30 07:54:30,716 INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-09-30_07-54-30_51/logs.
  89. algo-1-vmd10_1 | 2019-09-30 07:54:30,823 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:45224 to respond...
  90. algo-1-vmd10_1 | 2019-09-30 07:54:30,934 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:39871 to respond...
  91. algo-1-vmd10_1 | 2019-09-30 07:54:30,936 INFO services.py:760 -- Starting Redis shard with 0.83 GB max memory.
  92. algo-1-vmd10_1 | 2019-09-30 07:54:30,951 WARNING services.py:1261 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
  93. algo-1-vmd10_1 | 2019-09-30 07:54:30,951 INFO services.py:1384 -- Starting the Plasma object store with 1.24 GB memory using /tmp.
  94. algo-1-vmd10_1 | Running experiment with config {
  95. algo-1-vmd10_1 | "training": {
  96. algo-1-vmd10_1 | "env": "ArrivalSim-v0",
  97. algo-1-vmd10_1 | "run": "PPO",
  98. algo-1-vmd10_1 | "stop": {
  99. algo-1-vmd10_1 | "training_iteration": 2
  100. algo-1-vmd10_1 | },
  101. algo-1-vmd10_1 | "local_dir": "/opt/ml/output/intermediate",
  102. algo-1-vmd10_1 | "checkpoint_freq": 10,
  103. algo-1-vmd10_1 | "config": {
  104. algo-1-vmd10_1 | "num_workers": 1,
  105. algo-1-vmd10_1 | "train_batch_size": 128,
  106. algo-1-vmd10_1 | "sample_batch_size": 32,
  107. algo-1-vmd10_1 | "optimizer": {
  108. algo-1-vmd10_1 | "grads_per_step": 10
  109. algo-1-vmd10_1 | }
  110. algo-1-vmd10_1 | },
  111. algo-1-vmd10_1 | "checkpoint_at_end": true
  112. algo-1-vmd10_1 | }
  113. algo-1-vmd10_1 | }
  114. algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:64 -- Did not find checkpoint file in /opt/ml/output/intermediate/training.
  115. algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:211 -- Starting a new experiment.
  116. algo-1-vmd10_1 | == Status ==
  117. algo-1-vmd10_1 | Using FIFO scheduling algorithm.
  118. algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
  119. algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB
  120. algo-1-vmd10_1 |
  121. algo-1-vmd10_1 | == Status ==
  122. algo-1-vmd10_1 | Using FIFO scheduling algorithm.
  123. algo-1-vmd10_1 | Resources requested: 2/3 CPUs, 0/0 GPUs
  124. algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB
  125. algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training
  126. algo-1-vmd10_1 | Number of trials: 1 ({'RUNNING': 1})
  127. algo-1-vmd10_1 | RUNNING trials:
  128. algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: RUNNING
  129. algo-1-vmd10_1 |
  130. algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,765 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
  131. algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,776 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
  132. algo-1-vmd10_1 | (pid=72) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  133. algo-1-vmd10_1 | (pid=72) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  134. algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:40,860 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
  135. algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:44,007 INFO ppo.py:105 -- Important! Since 0.7.0, observation normalization is no longer enabled by default. To enable running-mean normalization, set 'observation_filter': 'MeanStdFilter'. You can ignore this message if your environment doesn't require observation normalization.
  136. algo-1-vmd10_1 | (pid=97) 2019-09-30 07:54:48,310 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
  137. algo-1-vmd10_1 | (pid=97) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  138. algo-1-vmd10_1 | (pid=97) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  139.  
  140. algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306
  141. algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306
  142. algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
  143. algo-1-vmd10_1 | (pid=97) [1.83663046] 0.2754945689088046 False {}
  144. algo-1-vmd10_1 | (pid=97) price b = 1.8366304593920306
  145. algo-1-vmd10_1 | (pid=97) price a = 1.5866304593920306
  146. algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
  147. algo-1-vmd10_1 | (pid=97) [1.58663046] 0.2538608735027249 False {}
  148. algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306
  149. algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306
  150. algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
  151. algo-1-vmd10_1 | (pid=97) [1.83663046] 0.23876195972096398 False {}
  152.  
  153. algo-1-vmd10_1 | Result for PPO_ArrivalSim-v0_0:
  154. algo-1-vmd10_1 | custom_metrics: {}
  155. algo-1-vmd10_1 | date: 2019-09-30_07-54-52
  156. algo-1-vmd10_1 | done: true
  157. algo-1-vmd10_1 | episode_len_mean: 13.9375
  158. algo-1-vmd10_1 | episode_reward_max: 15.623709832898886
  159. algo-1-vmd10_1 | episode_reward_mean: 2.1431919362241683
  160. algo-1-vmd10_1 | episode_reward_min: 0.0
  161. algo-1-vmd10_1 | episodes_this_iter: 8
  162. algo-1-vmd10_1 | episodes_total: 16
  163. algo-1-vmd10_1 | experiment_id: e401acafb745453a93cd07c23a49719a
  164. algo-1-vmd10_1 | hostname: 912222cf3d36
  165. algo-1-vmd10_1 | info:
  166. algo-1-vmd10_1 | default:
  167. algo-1-vmd10_1 | cur_kl_coeff: 0.30000001192092896
  168. algo-1-vmd10_1 | cur_lr: 4.999999873689376e-05
  169. algo-1-vmd10_1 | entropy: 1.0779298543930054
  170. algo-1-vmd10_1 | kl: 3.3599588871002197
  171. algo-1-vmd10_1 | policy_loss: -0.00849771499633789
  172. algo-1-vmd10_1 | total_loss: 9.052791595458984
  173. algo-1-vmd10_1 | vf_explained_var: 0.06744426488876343
  174. algo-1-vmd10_1 | vf_loss: 8.053301811218262
  175. algo-1-vmd10_1 | grad_time_ms: 1050.643
  176. algo-1-vmd10_1 | load_time_ms: 34.049
  177. algo-1-vmd10_1 | num_steps_sampled: 256
  178. algo-1-vmd10_1 | num_steps_trained: 256
  179. algo-1-vmd10_1 | sample_time_ms: 2921.621
  180. algo-1-vmd10_1 | update_time_ms: 214.194
  181. algo-1-vmd10_1 | iterations_since_restore: 2
  182. algo-1-vmd10_1 | node_ip: 172.18.0.2
  183. algo-1-vmd10_1 | num_healthy_workers: 1
  184. algo-1-vmd10_1 | num_metric_batches_dropped: 0
  185. algo-1-vmd10_1 | off_policy_estimator: {}
  186. algo-1-vmd10_1 | pid: 72
  187. algo-1-vmd10_1 | policy_reward_mean: {}
  188. algo-1-vmd10_1 | time_since_restore: 8.488733053207397
  189. algo-1-vmd10_1 | time_this_iter_s: 1.0875024795532227
  190. algo-1-vmd10_1 | time_total_s: 8.488733053207397
  191. algo-1-vmd10_1 | timestamp: 1569830092
  192. algo-1-vmd10_1 | timesteps_since_restore: 256
  193. algo-1-vmd10_1 | timesteps_this_iter: 128
  194. algo-1-vmd10_1 | timesteps_total: 256
  195. algo-1-vmd10_1 | training_iteration: 2
  196. algo-1-vmd10_1 |
  197. algo-1-vmd10_1 | 2019-09-30 07:54:52,557 INFO ray_trial_executor.py:178 -- Destroying actor for trial PPO_ArrivalSim-v0_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
  198. algo-1-vmd10_1 | == Status ==
  199. algo-1-vmd10_1 | Using FIFO scheduling algorithm.
  200. algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
  201. algo-1-vmd10_1 | Memory usage on this node: 2.0/4.1 GB
  202. algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training
  203. algo-1-vmd10_1 | Number of trials: 1 ({'TERMINATED': 1})
  204. algo-1-vmd10_1 | TERMINATED trials:
  205. algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: TERMINATED, [2 CPUs, 0 GPUs], [pid=72], 8 s, 2 iter, 256 ts, 2.14 rew
  206. algo-1-vmd10_1 |
  207. algo-1-vmd10_1 | Saved model configuration.
  208. algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2 as /opt/ml/model/checkpoint
  209. algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2.tune_metadata as /opt/ml/model/checkpoint.tune_metadata
  210. algo-1-vmd10_1 | 2019-09-30 07:54:57,605 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
  211. algo-1-vmd10_1 | 2019-09-30 07:54:57,607 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
  212. algo-1-vmd10_1 | /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  213. algo-1-vmd10_1 | "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  214. algo-1-vmd10_1 | 2019-09-30 07:54:58,769 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
  215. algo-1-vmd10_1 | (pid=73) 2019-09-30 07:54:59,065 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
  216. algo-1-vmd10_1 | (pid=73) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  217. algo-1-vmd10_1 | (pid=73) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  218. algo-1-vmd10_1 | Saved TensorFlow serving model!
  219. algo-1-vmd10_1 | 2019-09-30 07:55:03,775 sagemaker-containers INFO Reporting training SUCCESS
  220. tmp8z_8haic_algo-1-vmd10_1 exited with code 0
  221. Aborting on container exit...
  222. ---------------------------------------------------------------------------
  223. PermissionError Traceback (most recent call last)
  224. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
  225. 28 try:
  226. ---> 29 fsrc = open(src, 'rb')
  227. 30 except OSError as e:
  228.  
  229. PermissionError: [Errno 13] Permission denied: '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log'
  230.  
  231. During handling of the above exception, another exception occurred:
  232.  
  233. DistutilsFileError Traceback (most recent call last)
  234. <ipython-input-5-abacdc7913fc> in <module>()
  235. 34 )
  236. 35
  237. ---> 36 estimator.fit()
  238.  
  239. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name)
  240. 337 self._prepare_for_training(job_name=job_name)
  241. 338
  242. --> 339 self.latest_training_job = _TrainingJob.start_new(self, inputs)
  243. 340 if wait:
  244. 341 self.latest_training_job.wait(logs=logs)
  245.  
  246. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs)
  247. 856 cls._add_spot_checkpoint_args(local_mode, estimator, train_args)
  248. 857
  249. --> 858 estimator.sagemaker_session.train(**train_args)
  250. 859
  251. 860 return cls(estimator.sagemaker_session, estimator._current_job_name)
  252.  
  253. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path)
  254. 390 LOGGER.info("Creating training-job with name: %s", job_name)
  255. 391 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
  256. --> 392 self.sagemaker_client.create_training_job(**train_request)
  257. 393
  258. 394 def compile_model(
  259.  
  260. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
  261. 99 training_job = _LocalTrainingJob(container)
  262. 100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
  263. --> 101 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
  264. 102
  265. 103 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
  266.  
  267. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
  268. 87
  269. 88 self.model_artifacts = self.container.train(
  270. ---> 89 input_data_config, output_data_config, hyperparameters, job_name
  271. 90 )
  272. 91 self.end_time = datetime.datetime.now()
  273.  
  274. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
  275. 153 raise RuntimeError(msg)
  276. 154 finally:
  277. --> 155 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
  278. 156
  279. 157 # free up the training data directory as it may contain
  280.  
  281. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in retrieve_artifacts(self, compose_data, output_data_config, job_name)
  282. 253 sagemaker.local.utils.recursive_copy(host_dir, model_artifacts)
  283. 254 elif container_dir == "/opt/ml/output":
  284. --> 255 sagemaker.local.utils.recursive_copy(host_dir, output_artifacts)
  285. 256
  286. 257 # Tar Artifacts -> model.tar.gz and output.tar.gz
  287.  
  288. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/utils.py in recursive_copy(source, destination)
  289. 82 """
  290. 83 if os.path.isdir(source):
  291. ---> 84 copy_tree(source, destination)
  292.  
  293. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
  294. 157 copy_tree(src_name, dst_name, preserve_mode,
  295. 158 preserve_times, preserve_symlinks, update,
  296. --> 159 verbose=verbose, dry_run=dry_run))
  297. 160 else:
  298. 161 copy_file(src_name, dst_name, preserve_mode,
  299.  
  300. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
  301. 157 copy_tree(src_name, dst_name, preserve_mode,
  302. 158 preserve_times, preserve_symlinks, update,
  303. --> 159 verbose=verbose, dry_run=dry_run))
  304. 160 else:
  305. 161 copy_file(src_name, dst_name, preserve_mode,
  306.  
  307. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
  308. 157 copy_tree(src_name, dst_name, preserve_mode,
  309. 158 preserve_times, preserve_symlinks, update,
  310. --> 159 verbose=verbose, dry_run=dry_run))
  311. 160 else:
  312. 161 copy_file(src_name, dst_name, preserve_mode,
  313.  
  314. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
  315. 161 copy_file(src_name, dst_name, preserve_mode,
  316. 162 preserve_times, update, verbose=verbose,
  317. --> 163 dry_run=dry_run)
  318. 164 outputs.append(dst_name)
  319. 165
  320.  
  321. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in copy_file(src, dst, preserve_mode, preserve_times, update, link, verbose, dry_run)
  322. 149 # Otherwise (non-Mac, not linking), copy the file contents and
  323. 150 # (optionally) copy the times and mode.
  324. --> 151 _copy_file_contents(src, dst)
  325. 152 if preserve_mode or preserve_times:
  326. 153 st = os.stat(src)
  327.  
  328. ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
  329. 29 fsrc = open(src, 'rb')
  330. 30 except OSError as e:
  331. ---> 31 raise DistutilsFileError("could not open '%s': %s" % (src, e.strerror))
  332. 32
  333. 33 if os.path.exists(dst):
  334.  
  335. DistutilsFileError: could not open '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log': Permission denied
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement