Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Creating tmp8z_8haic_algo-1-vmd10_1 ...
- Attaching to tmp8z_8haic_algo-1-vmd10_1 ... done
- algo-1-vmd10_1 | 2019-09-30 07:54:22,885 sagemaker-containers INFO Imported framework sagemaker_tensorflow_container.training
- algo-1-vmd10_1 | 2019-09-30 07:54:22,892 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
- algo-1-vmd10_1 | 2019-09-30 07:54:23,011 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
- algo-1-vmd10_1 | 2019-09-30 07:54:23,031 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)
- algo-1-vmd10_1 | 2019-09-30 07:54:23,045 sagemaker-containers INFO Invoking user script
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | Training Env:
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | {
- algo-1-vmd10_1 | "additional_framework_parameters": {
- algo-1-vmd10_1 | "sagemaker_estimator": "RLEstimator"
- algo-1-vmd10_1 | },
- algo-1-vmd10_1 | "channel_input_dirs": {},
- algo-1-vmd10_1 | "current_host": "algo-1-vmd10",
- algo-1-vmd10_1 | "framework_module": "sagemaker_tensorflow_container.training:main",
- algo-1-vmd10_1 | "hosts": [
- algo-1-vmd10_1 | "algo-1-vmd10"
- algo-1-vmd10_1 | ],
- algo-1-vmd10_1 | "hyperparameters": {
- algo-1-vmd10_1 | "s3_bucket": "sagemaker-us-west-2-123456789012",
- algo-1-vmd10_1 | "rl.training.stop.training_iteration": 2,
- algo-1-vmd10_1 | "rl.training.checkpoint_freq": 2
- algo-1-vmd10_1 | },
- algo-1-vmd10_1 | "input_config_dir": "/opt/ml/input/config",
- algo-1-vmd10_1 | "input_data_config": {},
- algo-1-vmd10_1 | "input_dir": "/opt/ml/input",
- algo-1-vmd10_1 | "is_master": true,
- algo-1-vmd10_1 | "job_name": "ArrivalSim-2019-09-30-07-53-33-200",
- algo-1-vmd10_1 | "log_level": 20,
- algo-1-vmd10_1 | "master_hostname": "algo-1-vmd10",
- algo-1-vmd10_1 | "model_dir": "/opt/ml/model",
- algo-1-vmd10_1 | "module_dir": "s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz",
- algo-1-vmd10_1 | "module_name": "mod_op_train",
- algo-1-vmd10_1 | "network_interface_name": "eth0",
- algo-1-vmd10_1 | "num_cpus": 2,
- algo-1-vmd10_1 | "num_gpus": 0,
- algo-1-vmd10_1 | "output_data_dir": "/opt/ml/output/data",
- algo-1-vmd10_1 | "output_dir": "/opt/ml/output",
- algo-1-vmd10_1 | "output_intermediate_dir": "/opt/ml/output/intermediate",
- algo-1-vmd10_1 | "resource_config": {
- algo-1-vmd10_1 | "current_host": "algo-1-vmd10",
- algo-1-vmd10_1 | "hosts": [
- algo-1-vmd10_1 | "algo-1-vmd10"
- algo-1-vmd10_1 | ]
- algo-1-vmd10_1 | },
- algo-1-vmd10_1 | "user_entry_point": "mod_op_train.py"
- algo-1-vmd10_1 | }
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | Environment variables:
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | SM_HOSTS=["algo-1-vmd10"]
- algo-1-vmd10_1 | SM_NETWORK_INTERFACE_NAME=eth0
- algo-1-vmd10_1 | SM_HPS={"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"}
- algo-1-vmd10_1 | SM_USER_ENTRY_POINT=mod_op_train.py
- algo-1-vmd10_1 | SM_FRAMEWORK_PARAMS={"sagemaker_estimator":"RLEstimator"}
- algo-1-vmd10_1 | SM_RESOURCE_CONFIG={"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]}
- algo-1-vmd10_1 | SM_INPUT_DATA_CONFIG={}
- algo-1-vmd10_1 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data
- algo-1-vmd10_1 | SM_CHANNELS=[]
- algo-1-vmd10_1 | SM_CURRENT_HOST=algo-1-vmd10
- algo-1-vmd10_1 | SM_MODULE_NAME=mod_op_train
- algo-1-vmd10_1 | SM_LOG_LEVEL=20
- algo-1-vmd10_1 | SM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main
- algo-1-vmd10_1 | SM_INPUT_DIR=/opt/ml/input
- algo-1-vmd10_1 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config
- algo-1-vmd10_1 | SM_OUTPUT_DIR=/opt/ml/output
- algo-1-vmd10_1 | SM_NUM_CPUS=2
- algo-1-vmd10_1 | SM_NUM_GPUS=0
- algo-1-vmd10_1 | SM_MODEL_DIR=/opt/ml/model
- algo-1-vmd10_1 | SM_MODULE_DIR=s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz
- algo-1-vmd10_1 | SM_TRAINING_ENV={"additional_framework_parameters":{"sagemaker_estimator":"RLEstimator"},"channel_input_dirs":{},"current_host":"algo-1-vmd10","framework_module":"sagemaker_tensorflow_container.training:main","hosts":["algo-1-vmd10"],"hyperparameters":{"rl.training.checkpoint_freq":2,"rl.training.stop.training_iteration":2,"s3_bucket":"sagemaker-us-west-2-123456789012"},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"ArrivalSim-2019-09-30-07-53-33-200","log_level":20,"master_hostname":"algo-1-vmd10","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-west-2-123456789012/ArrivalSim-2019-09-30-07-53-33-200/source/sourcedir.tar.gz","module_name":"mod_op_train","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1-vmd10","hosts":["algo-1-vmd10"]},"user_entry_point":"mod_op_train.py"}
- algo-1-vmd10_1 | SM_USER_ARGS=["--rl.training.checkpoint_freq","2","--rl.training.stop.training_iteration","2","--s3_bucket","sagemaker-us-west-2-123456789012"]
- algo-1-vmd10_1 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
- algo-1-vmd10_1 | SM_HP_S3_BUCKET=sagemaker-us-west-2-123456789012
- algo-1-vmd10_1 | SM_HP_RL.TRAINING.STOP.TRAINING_ITERATION=2
- algo-1-vmd10_1 | SM_HP_RL.TRAINING.CHECKPOINT_FREQ=2
- algo-1-vmd10_1 | PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python36.zip:/usr/lib/python3.6:/usr/lib/python3.6/lib-dynload:/usr/local/lib/python3.6/dist-packages:/usr/lib/python3/dist-packages
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | Invoking script with the following command:
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | /usr/bin/python mod_op_train.py --rl.training.checkpoint_freq 2 --rl.training.stop.training_iteration 2 --s3_bucket sagemaker-us-west-2-123456789012
- algo-1-vmd10_1 |
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | {'monitor': False, 'log_level': 'INFO', 'callbacks': {'on_episode_start': None, 'on_episode_step': None, 'on_episode_end': None, 'on_sample_end': None, 'on_train_result': None}, 'ignore_worker_failures': False, 'model': {'conv_filters': None, 'conv_activation': 'relu', 'fcnet_activation': 'tanh', 'fcnet_hiddens': [256, 256], 'free_log_std': False, 'squash_to_range': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_preprocessor': None, 'custom_model': None, 'custom_options': {}}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'env_config': {}, 'env': None, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'num_workers': 2, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 'num_cpus_for_driver': 1, 'num_envs_per_worker': 1, 'sample_batch_size': 200, 'train_batch_size': 4000, 'batch_mode': 'truncate_episodes', 'sample_async': False, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_evaluator_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False, 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 'async_remote_worker_envs': False, 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policy_graphs': {}, 'policy_mapping_fn': None, 'policies_to_train': None}, 'use_gae': True, 'lambda': 1.0, 'kl_coeff': 0.2, 'sgd_minibatch_size': 
128, 'num_sgd_iter': 30, 'lr': 5e-05, 'lr_schedule': None, 'vf_share_layers': False, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'simple_optimizer': False, 'straggler_mitigation': False}
- algo-1-vmd10_1 | 2019-09-30 07:54:30,715 WARNING worker.py:1406 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.
- algo-1-vmd10_1 | 2019-09-30 07:54:30,716 INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-09-30_07-54-30_51/logs.
- algo-1-vmd10_1 | 2019-09-30 07:54:30,823 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:45224 to respond...
- algo-1-vmd10_1 | 2019-09-30 07:54:30,934 INFO services.py:363 -- Waiting for redis server at 127.0.0.1:39871 to respond...
- algo-1-vmd10_1 | 2019-09-30 07:54:30,936 INFO services.py:760 -- Starting Redis shard with 0.83 GB max memory.
- algo-1-vmd10_1 | 2019-09-30 07:54:30,951 WARNING services.py:1261 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance! You may be able to free up space by deleting files in /dev/shm or terminating any running plasma_store_server processes. If you are inside a Docker container, you may need to pass an argument with the flag '--shm-size' to 'docker run'.
- algo-1-vmd10_1 | 2019-09-30 07:54:30,951 INFO services.py:1384 -- Starting the Plasma object store with 1.24 GB memory using /tmp.
- algo-1-vmd10_1 | Running experiment with config {
- algo-1-vmd10_1 | "training": {
- algo-1-vmd10_1 | "env": "ArrivalSim-v0",
- algo-1-vmd10_1 | "run": "PPO",
- algo-1-vmd10_1 | "stop": {
- algo-1-vmd10_1 | "training_iteration": 2
- algo-1-vmd10_1 | },
- algo-1-vmd10_1 | "local_dir": "/opt/ml/output/intermediate",
- algo-1-vmd10_1 | "checkpoint_freq": 10,
- algo-1-vmd10_1 | "config": {
- algo-1-vmd10_1 | "num_workers": 1,
- algo-1-vmd10_1 | "train_batch_size": 128,
- algo-1-vmd10_1 | "sample_batch_size": 32,
- algo-1-vmd10_1 | "optimizer": {
- algo-1-vmd10_1 | "grads_per_step": 10
- algo-1-vmd10_1 | }
- algo-1-vmd10_1 | },
- algo-1-vmd10_1 | "checkpoint_at_end": true
- algo-1-vmd10_1 | }
- algo-1-vmd10_1 | }
- algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:64 -- Did not find checkpoint file in /opt/ml/output/intermediate/training.
- algo-1-vmd10_1 | 2019-09-30 07:54:31,086 INFO tune.py:211 -- Starting a new experiment.
- algo-1-vmd10_1 | == Status ==
- algo-1-vmd10_1 | Using FIFO scheduling algorithm.
- algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
- algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | == Status ==
- algo-1-vmd10_1 | Using FIFO scheduling algorithm.
- algo-1-vmd10_1 | Resources requested: 2/3 CPUs, 0/0 GPUs
- algo-1-vmd10_1 | Memory usage on this node: 1.2/4.1 GB
- algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training
- algo-1-vmd10_1 | Number of trials: 1 ({'RUNNING': 1})
- algo-1-vmd10_1 | RUNNING trials:
- algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: RUNNING
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,765 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
- algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:39,776 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
- algo-1-vmd10_1 | (pid=72) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
- algo-1-vmd10_1 | (pid=72) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
- algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:40,860 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
- algo-1-vmd10_1 | (pid=72) 2019-09-30 07:54:44,007 INFO ppo.py:105 -- Important! Since 0.7.0, observation normalization is no longer enabled by default. To enable running-mean normalization, set 'observation_filter': 'MeanStdFilter'. You can ignore this message if your environment doesn't require observation normalization.
- algo-1-vmd10_1 | (pid=97) 2019-09-30 07:54:48,310 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
- algo-1-vmd10_1 | (pid=97) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
- algo-1-vmd10_1 | (pid=97) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
- algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306
- algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306
- algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
- algo-1-vmd10_1 | (pid=97) [1.83663046] 0.2754945689088046 False {}
- algo-1-vmd10_1 | (pid=97) price b = 1.8366304593920306
- algo-1-vmd10_1 | (pid=97) price a = 1.5866304593920306
- algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
- algo-1-vmd10_1 | (pid=97) [1.58663046] 0.2538608735027249 False {}
- algo-1-vmd10_1 | (pid=97) price b = 1.5866304593920306
- algo-1-vmd10_1 | (pid=97) price a = 1.8366304593920306
- algo-1-vmd10_1 | (pid=97) (self.price).shape = (1,)
- algo-1-vmd10_1 | (pid=97) [1.83663046] 0.23876195972096398 False {}
- algo-1-vmd10_1 | Result for PPO_ArrivalSim-v0_0:
- algo-1-vmd10_1 | custom_metrics: {}
- algo-1-vmd10_1 | date: 2019-09-30_07-54-52
- algo-1-vmd10_1 | done: true
- algo-1-vmd10_1 | episode_len_mean: 13.9375
- algo-1-vmd10_1 | episode_reward_max: 15.623709832898886
- algo-1-vmd10_1 | episode_reward_mean: 2.1431919362241683
- algo-1-vmd10_1 | episode_reward_min: 0.0
- algo-1-vmd10_1 | episodes_this_iter: 8
- algo-1-vmd10_1 | episodes_total: 16
- algo-1-vmd10_1 | experiment_id: e401acafb745453a93cd07c23a49719a
- algo-1-vmd10_1 | hostname: 912222cf3d36
- algo-1-vmd10_1 | info:
- algo-1-vmd10_1 | default:
- algo-1-vmd10_1 | cur_kl_coeff: 0.30000001192092896
- algo-1-vmd10_1 | cur_lr: 4.999999873689376e-05
- algo-1-vmd10_1 | entropy: 1.0779298543930054
- algo-1-vmd10_1 | kl: 3.3599588871002197
- algo-1-vmd10_1 | policy_loss: -0.00849771499633789
- algo-1-vmd10_1 | total_loss: 9.052791595458984
- algo-1-vmd10_1 | vf_explained_var: 0.06744426488876343
- algo-1-vmd10_1 | vf_loss: 8.053301811218262
- algo-1-vmd10_1 | grad_time_ms: 1050.643
- algo-1-vmd10_1 | load_time_ms: 34.049
- algo-1-vmd10_1 | num_steps_sampled: 256
- algo-1-vmd10_1 | num_steps_trained: 256
- algo-1-vmd10_1 | sample_time_ms: 2921.621
- algo-1-vmd10_1 | update_time_ms: 214.194
- algo-1-vmd10_1 | iterations_since_restore: 2
- algo-1-vmd10_1 | node_ip: 172.18.0.2
- algo-1-vmd10_1 | num_healthy_workers: 1
- algo-1-vmd10_1 | num_metric_batches_dropped: 0
- algo-1-vmd10_1 | off_policy_estimator: {}
- algo-1-vmd10_1 | pid: 72
- algo-1-vmd10_1 | policy_reward_mean: {}
- algo-1-vmd10_1 | time_since_restore: 8.488733053207397
- algo-1-vmd10_1 | time_this_iter_s: 1.0875024795532227
- algo-1-vmd10_1 | time_total_s: 8.488733053207397
- algo-1-vmd10_1 | timestamp: 1569830092
- algo-1-vmd10_1 | timesteps_since_restore: 256
- algo-1-vmd10_1 | timesteps_this_iter: 128
- algo-1-vmd10_1 | timesteps_total: 256
- algo-1-vmd10_1 | training_iteration: 2
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | 2019-09-30 07:54:52,557 INFO ray_trial_executor.py:178 -- Destroying actor for trial PPO_ArrivalSim-v0_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
- algo-1-vmd10_1 | == Status ==
- algo-1-vmd10_1 | Using FIFO scheduling algorithm.
- algo-1-vmd10_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
- algo-1-vmd10_1 | Memory usage on this node: 2.0/4.1 GB
- algo-1-vmd10_1 | Result logdir: /opt/ml/output/intermediate/training
- algo-1-vmd10_1 | Number of trials: 1 ({'TERMINATED': 1})
- algo-1-vmd10_1 | TERMINATED trials:
- algo-1-vmd10_1 | - PPO_ArrivalSim-v0_0: TERMINATED, [2 CPUs, 0 GPUs], [pid=72], 8 s, 2 iter, 256 ts, 2.14 rew
- algo-1-vmd10_1 |
- algo-1-vmd10_1 | Saved model configuration.
- algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2 as /opt/ml/model/checkpoint
- algo-1-vmd10_1 | Saved the checkpoint file /opt/ml/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/checkpoint_2/checkpoint-2.tune_metadata as /opt/ml/model/checkpoint.tune_metadata
- algo-1-vmd10_1 | 2019-09-30 07:54:57,605 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
- algo-1-vmd10_1 | 2019-09-30 07:54:57,607 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
- algo-1-vmd10_1 | /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
- algo-1-vmd10_1 | "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
- algo-1-vmd10_1 | 2019-09-30 07:54:58,769 INFO multi_gpu_optimizer.py:74 -- LocalMultiGPUOptimizer devices ['/cpu:0']
- algo-1-vmd10_1 | (pid=73) 2019-09-30 07:54:59,065 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
- algo-1-vmd10_1 | (pid=73) /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
- algo-1-vmd10_1 | (pid=73) "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
- algo-1-vmd10_1 | Saved TensorFlow serving model!
- algo-1-vmd10_1 | 2019-09-30 07:55:03,775 sagemaker-containers INFO Reporting training SUCCESS
- tmp8z_8haic_algo-1-vmd10_1 exited with code 0
- Aborting on container exit...
- ---------------------------------------------------------------------------
- PermissionError Traceback (most recent call last)
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
- 28 try:
- ---> 29 fsrc = open(src, 'rb')
- 30 except OSError as e:
- PermissionError: [Errno 13] Permission denied: '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log'
- During handling of the above exception, another exception occurred:
- DistutilsFileError Traceback (most recent call last)
- <ipython-input-5-abacdc7913fc> in <module>()
- 34 )
- 35
- ---> 36 estimator.fit()
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name)
- 337 self._prepare_for_training(job_name=job_name)
- 338
- --> 339 self.latest_training_job = _TrainingJob.start_new(self, inputs)
- 340 if wait:
- 341 self.latest_training_job.wait(logs=logs)
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs)
- 856 cls._add_spot_checkpoint_args(local_mode, estimator, train_args)
- 857
- --> 858 estimator.sagemaker_session.train(**train_args)
- 859
- 860 return cls(estimator.sagemaker_session, estimator._current_job_name)
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path)
- 390 LOGGER.info("Creating training-job with name: %s", job_name)
- 391 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
- --> 392 self.sagemaker_client.create_training_job(**train_request)
- 393
- 394 def compile_model(
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
- 99 training_job = _LocalTrainingJob(container)
- 100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
- --> 101 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
- 102
- 103 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
- 87
- 88 self.model_artifacts = self.container.train(
- ---> 89 input_data_config, output_data_config, hyperparameters, job_name
- 90 )
- 91 self.end_time = datetime.datetime.now()
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
- 153 raise RuntimeError(msg)
- 154 finally:
- --> 155 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
- 156
- 157 # free up the training data directory as it may contain
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/image.py in retrieve_artifacts(self, compose_data, output_data_config, job_name)
- 253 sagemaker.local.utils.recursive_copy(host_dir, model_artifacts)
- 254 elif container_dir == "/opt/ml/output":
- --> 255 sagemaker.local.utils.recursive_copy(host_dir, output_artifacts)
- 256
- 257 # Tar Artifacts -> model.tar.gz and output.tar.gz
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/sagemaker/local/utils.py in recursive_copy(source, destination)
- 82 """
- 83 if os.path.isdir(source):
- ---> 84 copy_tree(source, destination)
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
- 157 copy_tree(src_name, dst_name, preserve_mode,
- 158 preserve_times, preserve_symlinks, update,
- --> 159 verbose=verbose, dry_run=dry_run))
- 160 else:
- 161 copy_file(src_name, dst_name, preserve_mode,
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
- 157 copy_tree(src_name, dst_name, preserve_mode,
- 158 preserve_times, preserve_symlinks, update,
- --> 159 verbose=verbose, dry_run=dry_run))
- 160 else:
- 161 copy_file(src_name, dst_name, preserve_mode,
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
- 157 copy_tree(src_name, dst_name, preserve_mode,
- 158 preserve_times, preserve_symlinks, update,
- --> 159 verbose=verbose, dry_run=dry_run))
- 160 else:
- 161 copy_file(src_name, dst_name, preserve_mode,
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/dir_util.py in copy_tree(src, dst, preserve_mode, preserve_times, preserve_symlinks, update, verbose, dry_run)
- 161 copy_file(src_name, dst_name, preserve_mode,
- 162 preserve_times, update, verbose=verbose,
- --> 163 dry_run=dry_run)
- 164 outputs.append(dst_name)
- 165
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in copy_file(src, dst, preserve_mode, preserve_times, update, link, verbose, dry_run)
- 149 # Otherwise (non-Mac, not linking), copy the file contents and
- 150 # (optionally) copy the times and mode.
- --> 151 _copy_file_contents(src, dst)
- 152 if preserve_mode or preserve_times:
- 153 st = os.stat(src)
- ~/anaconda3/envs/tensorflow_p36/lib/python3.6/distutils/file_util.py in _copy_file_contents(src, dst, buffer_size)
- 29 fsrc = open(src, 'rb')
- 30 except OSError as e:
- ---> 31 raise DistutilsFileError("could not open '%s': %s" % (src, e.strerror))
- 32
- 33 if os.path.exists(dst):
- DistutilsFileError: could not open '/tmp/tmp8z_8haic/algo-1-vmd10/output/intermediate/training/PPO_ArrivalSim-v0_0_2019-09-30_07-54-31mi7pqqk2/log_sync6vobqpa5.log': Permission denied
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement