SageMaker RL local mode: RLlib PPO training log for ArrivalSim-v0, ending in a PermissionError while retrieving artifacts

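This is the console output of a SageMaker Python SDK local-mode training job (an RLEstimator using the Ray/TensorFlow toolchain), followed by the notebook traceback raised after the container exited. For orientation, below is a minimal sketch of the kind of notebook cell that produces such a run. The entry point, hyperparameters, and job-name prefix are taken from the log; the toolkit version, IAM role, and source directory are assumptions based on the SageMaker SDK 1.x RLEstimator API, not a reconstruction of the author's exact code.

# Hedged reconstruction of the launching cell. Only entry_point,
# hyperparameters, and the job-name prefix come from the log itself;
# every other argument is an assumption.
from sagemaker.rl import RLEstimator, RLToolkit, RLFramework

estimator = RLEstimator(
    entry_point="mod_op_train.py",        # from the log: user_entry_point
    source_dir="src",                     # assumed source layout
    toolkit=RLToolkit.RAY,                # log shows Ray/RLlib output
    toolkit_version="0.6.5",              # assumed; matches the era of these messages
    framework=RLFramework.TENSORFLOW,     # log: sagemaker_tensorflow_container
    role="SageMakerRole",                 # assumed IAM role name
    train_instance_count=1,
    train_instance_type="local",          # local mode, per the docker-compose output
    base_job_name="ArrivalSim",           # log: job_name ArrivalSim-2019-09-30-...
    hyperparameters={
        "s3_bucket": "sagemaker-us-west-2-123456789012",
        "rl.training.stop.training_iteration": 2,
        "rl.training.checkpoint_freq": 2,
    },
)
estimator.fit()  # the call that eventually raises the DistutilsFileError at the end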
Creating tmp8z_8haic_algo-1-vmd10_1 ... done
Attaching to tmp8z_8haic_algo-1-vmd10_1
algo-1-vmd10_1  | 2019-09-30 07:54:22,885 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training
algo-1-vmd10_1  | 2019-09-30 07:54:22,892 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
algo-1-vmd10_1  | 2019-09-30 07:54:23,011 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
algo-1-vmd10_1  | 2019-09-30 07:54:23,031 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
algo-1-vmd10_1  | 2019-09-30 07:54:23,045 sagemaker-containers INFO     Invoking user script
algo-1-vmd10_1  |
algo-1-vmd10_1  | Training Env:
algo-1-vmd10_1  |
algo-1-vmd10_1  | {
algo-1-vmd10_1  |     "additional_framework_parameters": {
algo-1-vmd10_1  |         "sagemaker_estimator": "RLEstimator"
algo-1-vmd10_1  |     },
algo-1-vmd10_1  |     "channel_input_dirs": {},
algo-1-vmd10_1  |     "current_host": "algo-1-vmd10",
algo-1-vmd10_1  |     "framework_module": "sagemaker_tensorflow_container.training:main",
algo-1-vmd10_1  |     "hosts": [
algo-1-vmd10_1  |         "algo-1-vmd10"
algo-1-vmd10_1  |     ],
algo-1-vmd10_1  |     "hyperparameters": {
algo-1-vmd10_1  |         "s3_bucket": "sagemaker-us-west-2-123456789012",
algo-1-vmd10_1  |         "rl.training.stop.training_iteration": 2,
algo-1-vmd10_1  |         "rl.training.checkpoint_freq": 2
algo-1-vmd10_1  |     },
algo-1-vmd10_1  |     "input_config_dir": "/opt/ml/input/config",
algo-1-vmd10_1  |     "input_data_config": {},
algo-1-vmd10_1  |     "input_dir": "/opt/ml/input",
algo-1-vmd10_1  |     "is_master": true,
algo-1-vmd10_1  |     "job_name": "ArrivalSim-2019-09-30-07-53-33-200",
algo-1-vmd10_1  |     "log_level": 20,
algo-1-vmd10_1  |     "master_hostname": "algo-1-vmd10",
algo-1-vmd10_1  |     "model_dir": "/opt/ml/model",
algo-1-vmd10_1  |     "module_dir": "s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz",
algo-1-vmd10_1  |     "module_name": "mod_op_train",
algo-1-vmd10_1  |     "network_interface_name": "eth0",
algo-1-vmd10_1  |     "num_cpus": 2,
algo-1-vmd10_1  |     "num_gpus": 0,
algo-1-vmd10_1  |     "output_data_dir": "/opt/ml/output/data",
algo-1-vmd10_1  |     "output_dir": "/opt/ml/output",
algo-1-vmd10_1  |     "output_intermediate_dir": "/opt/ml/output/intermediate",
algo-1-vmd10_1  |     "resource_config": {
algo-1-vmd10_1  |         "current_host": "algo-1-vmd10",
algo-1-vmd10_1  |         "hosts": [
algo-1-vmd10_1  |             "algo-1-vmd10"
algo-1-vmd10_1  |         ]
algo-1-vmd10_1  |     },
algo-1-vmd10_1  |     "user_entry_point": "mod_op_train.py"
algo-1-vmd10_1  | }
algo-1-vmd10_1  |
algo-1-vmd10_1  | Environment variables:
algo-1-vmd10_1  |
algo-1-vmd10_1  | SM_HOSTS=["algo-1-vmd10"]
algo-1-vmd10_1  | SM_NETWORK_INTERFACE_NAME=eth0
algo-1-vmd10_1  | SM_HPS={"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"}
algo-1-vmd10_1  | SM_USER_ENTRY_POINT=mod_op_train.py
algo-1-vmd10_1  | SM_FRAMEWORK_PARAMS={"sagemaker_estimator":"RLEstimator"}
algo-1-vmd10_1  | SM_RESOURCE_CONFIG={"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]}
algo-1-vmd10_1  | SM_INPUT_DATA_CONFIG={}
algo-1-vmd10_1  | SM_OUTPUT_DATA_DIR=/opt/ml/output/data
algo-1-vmd10_1  | SM_CHANNELS=[]
algo-1-vmd10_1  | SM_CURRENT_HOST=algo-1-vmd10
algo-1-vmd10_1  | SM_MODULE_NAME=mod_op_train
algo-1-vmd10_1  | SM_LOG_LEVEL=20
algo-1-vmd10_1  | SM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main
algo-1-vmd10_1  | SM_INPUT_DIR=/opt/ml/input
algo-1-vmd10_1  | SM_INPUT_CONFIG_DIR=/opt/ml/input/config
algo-1-vmd10_1  | SM_OUTPUT_DIR=/opt/ml/output
algo-1-vmd10_1  | SM_NUM_CPUS=2
algo-1-vmd10_1  | SM_NUM_GPUS=0
algo-1-vmd10_1  | SM_MODEL_DIR=/opt/ml/model
algo-1-vmd10_1  | SM_MODULE_DIR=s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz
algo-1-vmd10_1  | SM_TRAINING_ENV={"additional_framework_parameters":{"sagemaker_estimator":"RLEstimator"},"channel_input_dirs":{},"current_host":"algo-1-vmd10","framework_module":"sagemaker_tensorflow_container.training:main","hosts":["algo-1-vmd10"],"hyperparameters":{"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"ArrivalSim-2019-09-30-07-53-33-200","log_level":20,"master_hostname":"algo-1-vmd10","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz","module_name":"mod_op_train","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]},"user_entry_point":"mod_op_train.py"}
algo-1-vmd10_1  | SM_USER_ARGS=["--rl.training.checkpoint_freq","2","--rl.training.stop.training_iteration","2","--s3_bucket","sagemaker-us-west-2-123456789012"]
algo-1-vmd10_1  | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
algo-1-vmd10_1  | SM_HP_S3_BUCKET=sagemaker-us-west-2-123456789012
algo-1-vmd10_1  | SM_HP_RL.TRAINING.STOP.TRAINING_ITERATION=2
algo-1-vmd10_1  | SM_HP_RL.TRAINING.CHECKPOINT_FREQ=2
algo-1-vmd10_1  | PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python36.zip:/usr/lib/python3.6:/usr/lib/python3.6/lib-dynload:/usr/local/lib/python3.6/dist-packages:/usr/lib/python3/dist-packages
algo-1-vmd10_1  |
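Everything in the training environment above is also exported to the entry point as SM_* environment variables. A minimal sketch of reading them from inside mod_op_train.py, using only the standard library (the key names are taken verbatim from the log):

# Sketch: reading the SM_* variables shown above from inside the container.
import json
import os

model_dir = os.environ["SM_MODEL_DIR"]                   # /opt/ml/model
intermediate = os.environ["SM_OUTPUT_INTERMEDIATE_DIR"]  # /opt/ml/output/intermediate
hps = json.loads(os.environ["SM_HPS"])                   # hyperparameter dict
num_iters = hps["rl.training.stop.training_iteration"]   # 2 in this run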
algo-1-vmd10_1  | Invoking script with the following command:
algo-1-vmd10_1  |
algo-1-vmd10_1  | /usr/bin/python mod_op_train.py --rl.training.checkpoint_freq 2 --rl.training.stop.training_iteration 2 --s3_bucket sagemaker-us-west-2-123456789012
algo-1-vmd10_1  |
algo-1-vmd10_1  |
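The same hyperparameters arrive a second time as command-line flags (see SM_USER_ARGS above). Because the flag names contain dots, argparse keeps the dots in the destination names, so a script has to read the parsed values back through vars() rather than attribute access. A hedged sketch of one way to consume them:

# Sketch: parsing the dotted hyperparameter flags SageMaker appends to
# the command line above. argparse keeps dots in the dest names, so the
# values are read back via vars() rather than attribute access.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--s3_bucket", type=str)
parser.add_argument("--rl.training.checkpoint_freq", type=int)
parser.add_argument("--rl.training.stop.training_iteration", type=int)
args, _ = parser.parse_known_args()

opts = vars(args)
checkpoint_freq = opts["rl.training.checkpoint_freq"]  # 2 in this run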
algo-1-vmd10_1  | {'monitor': False, 'log_level': 'INFO', 'callbacks': {'on_episode_start': None, 'on_episode_step': None, 'on_episode_end': None, 'on_sample_end': None, 'on_train_result': None}, 'ignore_worker_failures': False, 'model': {'conv_filters': None, 'conv_activation': 'relu', 'fcnet_activation': 'tanh', 'fcnet_hiddens': [256, 256], 'free_log_std': False, 'squash_to_range': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_preprocessor': None, 'custom_model': None, 'custom_options': {}}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'env_config': {}, 'env': None, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'num_workers': 2, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 'num_cpus_for_driver': 1, 'num_envs_per_worker': 1, 'sample_batch_size': 200, 'train_batch_size': 4000, 'batch_mode': 'truncate_episodes', 'sample_async': False, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_evaluator_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False, 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 'async_remote_worker_envs': False, 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policy_graphs': {}, 'policy_mapping_fn': None, 'policies_to_train': None}, 'use_gae': True, 'lambda': 1.0, 'kl_coeff': 0.2, 'sgd_minibatch_size': 128, 'num_sgd_iter': 30, 'lr': 5e-05, 'lr_schedule': None, 'vf_share_layers': False, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'simple_optimizer': False, 'straggler_mitigation': False}
algo-1-vmd10_1  | 2019-09-30 07:54:30,715   WARNING worker.py:1406 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.
algo-1-vmd10_1  | 2019-09-30 07:54:30,716   INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-09-30_07-54-30_51/logs.
algo-1-vmd10_1  | 2019-09-30 07:54:30,823   INFO services.py:363 -- Waiting for redis server at 127.0.0.1:45224 to respond...
algo-1-vmd10_1  | 2019-09-30 07:54:30,934   INFO services.py:363 -- Waiting for redis server at 127.0.0.1:39871 to respond...
algo-1-vmd10_1  | 2019-09-30 07:54:30,936   INFO services.py:760 -- Starting Redis shard with 0.83 GB max memory.
algo-1-vmd10_1  | 2019-09-30 07:54:30,951   WARNING services.py:1261 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
algo-1-vmd10_1  | 2019-09-30 07:54:30,951   INFO services.py:1384 -- Starting the Plasma object store with 1.24 GB memory using /tmp.
algo-1-vmd10_1  | Running experiment with config {
algo-1-vmd10_1  |   "training": {
algo-1-vmd10_1  |     "env": "ArrivalSim-v0",
algo-1-vmd10_1  |     "run": "PPO",
algo-1-vmd10_1  |     "stop": {
algo-1-vmd10_1  |       "training_iteration": 2
algo-1-vmd10_1  |     },
algo-1-vmd10_1  |     "local_dir": "/opt/ml/output/intermediate",
algo-1-vmd10_1  |     "checkpoint_freq": 10,
algo-1-vmd10_1  |     "config": {
algo-1-vmd10_1  |       "num_workers": 1,
algo-1-vmd10_1  |       "train_batch_size": 128,
algo-1-vmd10_1  |       "sample_batch_size": 32,
algo-1-vmd10_1  |       "optimizer": {
algo-1-vmd10_1  |         "grads_per_step": 10
algo-1-vmd10_1  |       }
algo-1-vmd10_1  |     },
algo-1-vmd10_1  |     "checkpoint_at_end": true
algo-1-vmd10_1  |   }
algo-1-vmd10_1  | }
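The experiment spec printed above closely mirrors the Ray Tune API of this era (~0.6.x). A sketch of roughly how such a configuration is handed to tune.run_experiments directly; the registration of ArrivalSim-v0 is an assumption, and note that plain Tune expects the env name inside "config" rather than at the top level of the spec:

# Sketch: the spec above, expressed against the era's Ray Tune API.
import ray
from ray import tune

ray.init()
tune.run_experiments({
    "training": {
        "run": "PPO",
        "stop": {"training_iteration": 2},
        "local_dir": "/opt/ml/output/intermediate",
        "checkpoint_freq": 10,
        "checkpoint_at_end": True,
        "config": {
            "env": "ArrivalSim-v0",  # assumed registered via ray.tune.registry.register_env
            "num_workers": 1,
            "train_batch_size": 128,
            "sample_batch_size": 32,
            "optimizer": {"grads_per_step": 10},
        },
    },
})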
algo-1-vmd10_1  | 2019-09-30 07:54:31,086   INFO tune.py:64 -- Did not find checkpoint file in /opt/ml/output/intermediate/training.
algo-1-vmd10_1  | 2019-09-30 07:54:31,086   INFO tune.py:211 -- Starting a new experiment.
algo-1-vmd10_1  | == Status ==
algo-1-vmd10_1  | Using FIFO scheduling algorithm.
algo-1-vmd10_1  | Resources requested: 0/3 CPUs, 0/0 GPUs
algo-1-vmd10_1  | Memory usage on this node: 1.2/4.1 GB
algo-1-vmd10_1  |
algo-1-vmd10_1  | == Status ==
algo-1-vmd10_1  | Using FIFO scheduling algorithm.
algo-1-vmd10_1  | Resources requested: 2/3 CPUs, 0/0 GPUs
algo-1-vmd10_1  | Memory usage on this node: 1.2/4.1 GB
algo-1-vmd10_1  | Result logdir: /opt/ml/output/intermediate/training
algo-1-vmd10_1  | Number of trials: 1 ({'RUNNING': 1})
algo-1-vmd10_1  | RUNNING trials:
algo-1-vmd10_1  |  - PPO_ArrivalSim-v0_0:   RUNNING
algo-1-vmd10_1  |
algo-1-vmd10_1  | (pid=72) 2019-09-30 07:54:39,765  WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
algo-1-vmd10_1  | (pid=72) 2019-09-30 07:54:39,776  INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1  | (pid=72) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1  | (pid=72)   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
algo-1-vmd10_1  | (pid=72) 2019-09-30 07:54:40,860  INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
algo-1-vmd10_1  | (pid=72) 2019-09-30 07:54:44,007  INFO ppo.py:105 -- Important! Since 0.7.0, observation normalization is no longer enabled by default. To enable running-mean normalization, set 'observation_filter': 'MeanStdFilter'. You can ignore this message if your environment doesn't require observation normalization.
algo-1-vmd10_1  | (pid=97) 2019-09-30 07:54:48,310  INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1  | (pid=97) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1  | (pid=97)   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "

algo-1-vmd10_1  | (pid=97) price b = 1.5866304593920306
algo-1-vmd10_1  | (pid=97) price a = 1.8366304593920306
algo-1-vmd10_1  | (pid=97) (self.price).shape = (1,)
algo-1-vmd10_1  | (pid=97) [1.83663046] 0.2754945689088046 False {}
algo-1-vmd10_1  | (pid=97) price b = 1.8366304593920306
algo-1-vmd10_1  | (pid=97) price a = 1.5866304593920306
algo-1-vmd10_1  | (pid=97) (self.price).shape = (1,)
algo-1-vmd10_1  | (pid=97) [1.58663046] 0.2538608735027249 False {}
algo-1-vmd10_1  | (pid=97) price b = 1.5866304593920306
algo-1-vmd10_1  | (pid=97) price a = 1.8366304593920306
algo-1-vmd10_1  | (pid=97) (self.price).shape = (1,)
algo-1-vmd10_1  | (pid=97) [1.83663046] 0.23876195972096398 False {}

algo-1-vmd10_1  | Result for PPO_ArrivalSim-v0_0:
algo-1-vmd10_1  |   custom_metrics: {}
algo-1-vmd10_1  |   date: 2019-09-30_07-54-52
algo-1-vmd10_1  |   done: true
algo-1-vmd10_1  |   episode_len_mean: 13.9375
algo-1-vmd10_1  |   episode_reward_max: 15.623709832898886
algo-1-vmd10_1  |   episode_reward_mean: 2.1431919362241683
algo-1-vmd10_1  |   episode_reward_min: 0.0
algo-1-vmd10_1  |   episodes_this_iter: 8
algo-1-vmd10_1  |   episodes_total: 16
algo-1-vmd10_1  |   experiment_id: e401acafb745453a93cd07c23a49719a
algo-1-vmd10_1  |   hostname: 912222cf3d36
algo-1-vmd10_1  |   info:
algo-1-vmd10_1  |     default:
algo-1-vmd10_1  |       cur_kl_coeff: 0.30000001192092896
algo-1-vmd10_1  |       cur_lr: 4.999999873689376e-05
algo-1-vmd10_1  |       entropy: 1.0779298543930054
algo-1-vmd10_1  |       kl: 3.3599588871002197
algo-1-vmd10_1  |       policy_loss: -0.00849771499633789
algo-1-vmd10_1  |       total_loss: 9.052791595458984
algo-1-vmd10_1  |       vf_explained_var: 0.06744426488876343
algo-1-vmd10_1  |       vf_loss: 8.053301811218262
algo-1-vmd10_1  |     grad_time_ms: 1050.643
algo-1-vmd10_1  |     load_time_ms: 34.049
algo-1-vmd10_1  |     num_steps_sampled: 256
algo-1-vmd10_1  |     num_steps_trained: 256
algo-1-vmd10_1  |     sample_time_ms: 2921.621
algo-1-vmd10_1  |     update_time_ms: 214.194
algo-1-vmd10_1  |   iterations_since_restore: 2
algo-1-vmd10_1  |   node_ip: 172.18.0.2
algo-1-vmd10_1  |   num_healthy_workers: 1
algo-1-vmd10_1  |   num_metric_batches_dropped: 0
algo-1-vmd10_1  |   off_policy_estimator: {}
algo-1-vmd10_1  |   pid: 72
algo-1-vmd10_1  |   policy_reward_mean: {}
algo-1-vmd10_1  |   time_since_restore: 8.488733053207397
algo-1-vmd10_1  |   time_this_iter_s: 1.0875024795532227
algo-1-vmd10_1  |   time_total_s: 8.488733053207397
algo-1-vmd10_1  |   timestamp: 1569830092
algo-1-vmd10_1  |   timesteps_since_restore: 256
algo-1-vmd10_1  |   timesteps_this_iter: 128
algo-1-vmd10_1  |   timesteps_total: 256
algo-1-vmd10_1  |   training_iteration: 2
algo-1-vmd10_1  |
algo-1-vmd10_1  | 2019-09-30 07:54:52,557   INFO ray_trial_executor.py:178 -- Destroying actor for trial PPO_ArrivalSim-v0_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
algo-1-vmd10_1  | == Status ==
algo-1-vmd10_1  | Using FIFO scheduling algorithm.
algo-1-vmd10_1  | Resources requested: 0/3 CPUs, 0/0 GPUs
algo-1-vmd10_1  | Memory usage on this node: 2.0/4.1 GB
algo-1-vmd10_1  | Result logdir: /opt/ml/output/intermediate/training
algo-1-vmd10_1  | Number of trials: 1 ({'TERMINATED': 1})
algo-1-vmd10_1  | TERMINATED trials:
algo-1-vmd10_1  |  - PPO_ArrivalSim-v0_0:   TERMINATED, [2 CPUs, 0 GPUs], [pid=72], 8 s, 2 iter, 256 ts, 2.14 rew
algo-1-vmd10_1  |
algo-1-vmd10_1  | Saved model configuration.
algo-1-vmd10_1  | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2 as /opt/ml/model/checkpoint
algo-1-vmd10_1  | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2.tune_metadata as /opt/ml/model/checkpoint.tune_metadata
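The files copied to /opt/ml/model above form a standard RLlib/Tune checkpoint, so they can be restored outside SageMaker. A hedged sketch against the RLlib API of this era (PPOAgent was the agent class name before the later rename to PPOTrainer); the environment registration and the zero-filled observation are assumptions, not the author's code:

# Sketch: restoring the checkpoint that was copied to /opt/ml/model.
import ray
from ray.rllib.agents.ppo import PPOAgent

ray.init()
# "ArrivalSim-v0" must be registered (e.g. via ray.tune.registry.register_env)
# under the same name used in training, with a config matching the run above.
agent = PPOAgent(env="ArrivalSim-v0", config={"num_workers": 0})
agent.restore("/opt/ml/model/checkpoint")

obs = [0.0]  # placeholder; the debug lines above show a shape-(1,) observation
action = agent.compute_action(obs)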
algo-1-vmd10_1  | 2019-09-30 07:54:57,605   WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
algo-1-vmd10_1  | 2019-09-30 07:54:57,607   INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1  | /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1  |   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
algo-1-vmd10_1  | 2019-09-30 07:54:58,769   INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
algo-1-vmd10_1  | (pid=73) 2019-09-30 07:54:59,065  INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
algo-1-vmd10_1  | (pid=73) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
algo-1-vmd10_1  | (pid=73)   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
algo-1-vmd10_1  | Saved TensorFlow serving model!
algo-1-vmd10_1  | 2019-09-30 07:55:03,775 sagemaker-containers INFO     Reporting training SUCCESS
tmp8z_8haic_algo-1-vmd10_1 exited with code 0
Aborting on container exit...
---------------------------------------------------------------------------
PermissionError                           Traceback (most recent call last)
~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
     28         try:
---> 29             fsrc = open(src, 'rb')
     30         except OSError as e:

PermissionError: [Errno 13] Permission denied: '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log'

During handling of the above exception, another exception occurred:

DistutilsFileError                        Traceback (most recent call last)
<ipython-input-5-abacdc7913fc> in <module>()
     34                     )
     35
---> 36 estimator.fit()

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name)
    337         self._prepare_for_training(job_name=job_name)
    338
--> 339         self.latest_training_job = _TrainingJob.start_new(self, inputs)
    340         if wait:
    341             self.latest_training_job.wait(logs=logs)

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs)
    856         cls._add_spot_checkpoint_args(local_mode, estimator, train_args)
    857
--> 858         estimator.sagemaker_session.train(**train_args)
    859
    860         return cls(estimator.sagemaker_session, estimator._current_job_name)

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path)
    390         LOGGER.info("Creating training-job with name: %s", job_name)
    391         LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 392         self.sagemaker_client.create_training_job(**train_request)
    393
    394     def compile_model(

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
     99         training_job = _LocalTrainingJob(container)
    100         hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
--> 101         training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
    102
    103         LocalSagemakerClient._training_jobs[TrainingJobName] = training_job

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
     87
     88         self.model_artifacts = self.container.train(
---> 89             input_data_config, output_data_config, hyperparameters, job_name
     90         )
     91         self.end_time = datetime.datetime.now()

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
    153             raise RuntimeError(msg)
    154         finally:
--> 155             artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
    156
    157             # free up the training data directory as it may contain

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in retrieve_artifacts(self, compose_data, output_data_config, job_name)
    253                     sagemaker.local.utils.recursive_copy(host_dir, model_artifacts)
    254                 elif container_dir == "/opt/ml/output":
--> 255                     sagemaker.local.utils.recursive_copy(host_dir, output_artifacts)
    256
    257         # Tar Artifacts -> model.tar.gz and output.tar.gz

~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/utils.py in recursive_copy(source, destination)
     82     """
     83     if os.path.isdir(source):
---> 84         copy_tree(source, destination)

~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
    157                 copy_tree(src_name, dst_name, preserve_mode,
    158                           preserve_times, preserve_symlinks, update,
--> 159                           verbose=verbose, dry_run=dry_run))
    160         else:
    161             copy_file(src_name, dst_name, preserve_mode,

~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
    157                 copy_tree(src_name, dst_name, preserve_mode,
    158                           preserve_times, preserve_symlinks, update,
--> 159                           verbose=verbose, dry_run=dry_run))
    160         else:
    161             copy_file(src_name, dst_name, preserve_mode,

~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
    157                 copy_tree(src_name, dst_name, preserve_mode,
    158                           preserve_times, preserve_symlinks, update,
--> 159                           verbose=verbose, dry_run=dry_run))
    160         else:
    161             copy_file(src_name, dst_name, preserve_mode,

~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
    161             copy_file(src_name, dst_name, preserve_mode,
    162                       preserve_times, update, verbose=verbose,
--> 163                       dry_run=dry_run)
    164             outputs.append(dst_name)
    165

~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in copy_file(src, dst, preserve_mode, preserve_times, update, link, verbose, dry_run)
    149     # Otherwise (non-Mac, not linking), copy the file contents and
    150     # (optionally) copy the times and mode.
--> 151     _copy_file_contents(src, dst)
    152     if preserve_mode or preserve_times:
    153         st = os.stat(src)

~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
     29             fsrc = open(src, 'rb')
     30         except OSError as e:
---> 31             raise DistutilsFileError("could not open '%s': %s" % (src, e.strerror))
     32
     33         if os.path.exists(dst):

DistutilsFileError: could not open '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log': Permission denied
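The training itself succeeded; the failure happens afterwards, in local mode's retrieve_artifacts. The likely root cause: the container runs as root, so Ray's log-sync file under /opt/ml/output/intermediate ends up root-owned without world-read permission, and copy_tree, which runs as the notebook user on the host, fails when it reaches it. One hedged workaround is to relax permissions on the output tree at the very end of the entry point, while the process still runs as root inside the container:

# Hedged workaround sketch (not the author's code): make everything under
# /opt/ml/output group/world readable before the container exits, so the
# host-side copy_tree in retrieve_artifacts can read Ray's log files.
import os
import stat

def make_world_readable(top):
    for root, dirs, files in os.walk(top):
        for d in dirs:
            p = os.path.join(root, d)
            # directories also need the execute bit so they can be traversed
            os.chmod(p, os.stat(p).st_mode | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
        for f in files:
            p = os.path.join(root, f)
            os.chmod(p, os.stat(p).st_mode | stat.S_IRGRP | stat.S_IROTH)

make_world_readable("/opt/ml/output")  # call as the last line of mod_op_train.py

Deleting Ray's log_sync*.log files under the intermediate directory before the script exits would sidestep the failing copy as well, at the cost of losing those logs.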