import argparse
import gym
import numpy as np
import os
import tensorflow as tf
import time
import pickle

from tensorforce.agents import PPOAgent
from rl_models.agents import *

def parse_args():
    parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments")
    # Environment
    parser.add_argument("--scenario", type=str, default="vip_rl", help="name of the scenario script")
    parser.add_argument("--max-episode-len", type=int, default=200, help="maximum episode length")
    parser.add_argument("--num-episodes", type=int, default=60000, help="number of episodes")
    parser.add_argument("--agent-type", type=str, default="ppo", help="policy for bodyguards")
    # Checkpointing
    parser.add_argument("--exp-name", type=str, default=None, help="name of the experiment")
    parser.add_argument("--save-dir", type=str, default="/tmp/policy/", help="directory in which training state and model should be saved")
    parser.add_argument("--save-rate", type=int, default=1000, help="save model once every time this many episodes are completed")
    parser.add_argument("--load-dir", type=str, default="", help="directory in which training state and model are loaded")
    # Evaluation
    parser.add_argument("--restore", action="store_true", default=False)
    # a store_true flag defaulting to True could never be switched off, and
    # display mode skips the stats/saving code in the loop, so default to off
    parser.add_argument("--display", action="store_true", default=False)
    parser.add_argument("--benchmark", action="store_true", default=False)
    parser.add_argument("--benchmark-iters", type=int, default=100000, help="number of iterations run for benchmarking")
    parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/", help="directory where benchmark data is saved")
    parser.add_argument("--plots-dir", type=str, default="./learning_curves/", help="directory where plot data is saved")
    return parser.parse_args()
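
# Example invocation (hypothetical; assumes this script is saved as train.py
# and that a vip_rl scenario exists under multiagent/scenarios/):
#   python train.py --scenario vip_rl --num-episodes 20000 --exp-name vip_test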

def make_env(scenario_name):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios
    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                         scenario.observation, scenario.info, scenario.done)
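
# The scenario script is expected to follow the multiagent-particle-envs
# convention: a module under multiagent/scenarios/ exposing a Scenario class
# whose callbacks are wired into MultiAgentEnv above. A minimal skeleton
# (method names inferred from the call sites; the real vip_rl scenario is not
# part of this paste):
#
#   class Scenario(BaseScenario):
#       def make_world(self): ...            # build and return the World
#       def reset_world(self, world): ...    # re-randomize entity state
#       def reward(self, agent, world): ...  # per-agent scalar reward
#       def observation(self, agent, world): ...
#       def info(self, agent, world): ...    # benchmarking info
#       def done(self, agent, world): ...    # per-agent termination flag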

def get_trainers(rl_agent, observation_space_dimension, action_space_dimension, number_of_agents):
    trainers = []
    for i in range(number_of_agents):
        agent = DistributedTrainer(rl_agent + ".json", observation_space_dimension[i], action_space_dimension[i])
        trainers.append(agent)
    return trainers
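
# DistributedTrainer is pulled in by the wildcard import from rl_models.agents,
# which is not included in this paste. From its call sites, the expected
# interface is __init__(spec_path, observation_space, action_space),
# action(obs) -> action, and update(reward, terminal). The class below is a
# hypothetical random-policy stand-in for smoke-testing the training loop
# without the real dependency; it is NOT the project's implementation.
class RandomTrainerStub:
    """Drop-in stand-in matching the trainer interface used in train()."""

    def __init__(self, spec_path, observation_space, action_space):
        self.action_space = action_space  # a gym.Space for this agent

    def action(self, obs):
        # gym spaces expose .sample() for a random valid action
        return self.action_space.sample()

    def update(self, reward, terminal):
        pass  # the real trainer would record the transition and learn here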

def train(arglist):
    env = make_env(arglist.scenario)
    obs_shape_n = [env.observation_space[i] for i in range(env.n)]
    action_shape_n = [env.action_space[i] for i in range(env.n)]
    trainers = get_trainers(arglist.agent_type, obs_shape_n, action_shape_n, env.n)

    episode_rewards = [0.0]  # sum of rewards for all agents
    agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_ag_rewards = []  # agent rewards for training curve
    agent_info = [[[]]]  # placeholder for benchmarking info

    # event writer so the per-episode reward summaries below actually reach
    # disk (assumes TF 1.x, which matches the tensorforce-era imports above)
    summary_writer = tf.summary.FileWriter(arglist.save_dir)

    obs_n = env.reset()
    episode_step = 0
    train_step = 0
    t_start = time.time()

    print('Starting iterations...')
    while True:
        # collect one action per agent from its trainer
        action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
        # environment step
        new_obs_n, rew_n, done_n, info_n = env.step(action_n)
        episode_step += 1
        train_step += 1
        done = all(done_n)
        terminal = (episode_step >= arglist.max_episode_len)
        obs_n = new_obs_n

        # accumulate global and per-agent episode rewards
        for i, rew in enumerate(rew_n):
            episode_rewards[-1] += rew
            agent_rewards[i][-1] += rew

        # for i, agent in enumerate(trainers):
        #     agent.update(rew_n[i], terminal)

        if done or terminal:
            # feed the episode's final reward back to each trainer
            for i, agent in enumerate(trainers):
                agent.update(rew_n[i], done or terminal)
            obs_n = env.reset()
            # write the finished episode's total reward as a TF1 summary proto
            # through the FileWriter created above (tf.summary.scalar on its
            # own only builds a graph op and never reaches disk)
            summary = tf.Summary(value=[tf.Summary.Value(
                tag='episode_reward', simple_value=episode_rewards[-1])])
            summary_writer.add_summary(summary, len(episode_rewards))
            episode_step = 0
            episode_rewards.append(0)
            for a in agent_rewards:
                a.append(0)
            agent_info.append([[]])

        # for displaying learned policies
        if arglist.display:
            env.render()
            continue  # rendering runs skip the stats/saving bookkeeping below

        # print progress and record curve data every save_rate episodes
        if terminal and (len(episode_rewards) % arglist.save_rate == 0):
            print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time() - t_start, 3)))
            t_start = time.time()
            # keep track of final episode reward
            final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
            for rew in agent_rewards:
                final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

        # dump the learning curves and stop once enough episodes have run
        if len(episode_rewards) > arglist.num_episodes:
            os.makedirs(arglist.plots_dir, exist_ok=True)  # plots dir may not exist yet
            rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'  # requires --exp-name
            with open(rew_file_name, 'wb') as fp:
                pickle.dump(final_ep_rewards, fp)
            agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
            with open(agrew_file_name, 'wb') as fp:
                pickle.dump(final_ep_ag_rewards, fp)
            print('...Finished total of {} episodes.'.format(len(episode_rewards)))
            break

if __name__ == '__main__':
    arglist = parse_args()
    train(arglist)
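
# To inspect a saved learning curve afterwards (hypothetical snippet, not part
# of this script; the file name depends on the --exp-name you passed):
#   import pickle
#   import matplotlib.pyplot as plt
#   with open('./learning_curves/<exp_name>_rewards.pkl', 'rb') as fp:
#       plt.plot(pickle.load(fp))
#   plt.show()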