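# Hyperparameter search for Stable-Baselines3 PPO on LunarLanderContinuous-v2,
# driven by Ray Tune with an Optuna search algorithm. Each trial trains a model,
# checkpoints it periodically, and reports its mean evaluation reward to Tune.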
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
# from pettingzoo.butterfly import pistonball_v4
import supersuit as ss
from ray import tune
from ray.tune.suggest.optuna import OptunaSearch
import optuna
import os
import ray
from pathlib import Path
import gym
from ray.tune.suggest import ConcurrencyLimiter
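
# PPO hyperparameter search space: log-uniform distributions for scale-like
# parameters (learning rate, gamma, entropy coefficient, gradient clipping)
# and categorical choices for the rollout/batch sizes and env count.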
space = {
    "n_epochs": optuna.distributions.IntUniformDistribution(3, 50),
    "gamma": optuna.distributions.LogUniformDistribution(.9, .999),
    "ent_coef": optuna.distributions.LogUniformDistribution(.001, .1),
    "learning_rate": optuna.distributions.LogUniformDistribution(5e-6, 5e-4),
    "vf_coef": optuna.distributions.UniformDistribution(.1, 1),
    "gae_lambda": optuna.distributions.UniformDistribution(.8, 1),
    "max_grad_norm": optuna.distributions.LogUniformDistribution(.01, 10),
    "n_steps": optuna.distributions.CategoricalDistribution([128, 256, 512, 1024, 2048, 4096]),
    "batch_size": optuna.distributions.CategoricalDistribution([32, 64, 128, 256]),  # , 512, 1024, 2048, 4096
    "n_envs": optuna.distributions.CategoricalDistribution([2, 4, 8]),
    "clip_range": optuna.distributions.UniformDistribution(.1, 5),
}
optuna_search = OptunaSearch(
    space,
    metric="mean_reward",
    mode="max",
)
def make_env(n_envs):
    env = gym.make('LunarLanderContinuous-v2')
    if n_envs is not None:
        env = ss.stable_baselines3_vec_env_v0(env, n_envs, multiprocessing=False)
    return env
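
# Load the latest checkpoint saved for this trial and estimate its mean
# episode reward over 100 deterministic evaluation episodes.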
def evaluate_all_policies(name):

    def evaluate_policy(env, model):
        total_reward = 0
        NUM_RESETS = 100
        for i in range(NUM_RESETS):
            done = False
            obs = env.reset()
            while not done:
                act = model.predict(obs, deterministic=True)[0]
                obs, reward, done, info = env.step(act)  # update obs so the policy acts on the current state
                total_reward += reward
        return total_reward / NUM_RESETS

    env = make_env(None)
    policy_folder = str(Path.home()) + '/policy_logs/' + name + '/'
    policy_files = os.listdir(policy_folder)
    # Checkpoints are named 'rl_model_<steps>_steps.zip'; sort numerically by step count and take the latest.
    policy_file = sorted(policy_files, key=lambda x: int(x[9:-10]))[-1]
    model = PPO.load(policy_folder + policy_file)
    return evaluate_policy(env, model)
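
# Encode the sampled hyperparameters into a filesystem-safe run name,
# used for both the checkpoint and TensorBoard log folders.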
def gen_filename(params):
    name = ''
    for key in params.keys():
        name = name + key + '_' + str(params[key])[0:5] + '_'
    name = name[0:-1]  # removes trailing _
    return name.replace('.', '')
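
# Trainable executed by Ray Tune: build a PPO model from the sampled
# hyperparameters, train it with periodic checkpointing, then report the
# mean evaluation reward of the latest checkpoint back to Tune.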
def train(parameterization):
    name = gen_filename(parameterization)
    folder = str(Path.home()) + '/policy_logs/' + name + '/'
    checkpoint_callback = CheckpointCallback(save_freq=400, save_path=folder)
    env = make_env(parameterization['n_envs'])
    model = PPO(
        "MlpPolicy",
        env,
        gamma=parameterization['gamma'],
        n_steps=parameterization['n_steps'],
        ent_coef=parameterization['ent_coef'],
        learning_rate=parameterization['learning_rate'],
        vf_coef=parameterization['vf_coef'],
        max_grad_norm=parameterization['max_grad_norm'],
        gae_lambda=parameterization['gae_lambda'],
        batch_size=parameterization['batch_size'],
        clip_range=parameterization['clip_range'],
        n_epochs=parameterization['n_epochs'],
        tensorboard_log=str(Path.home()) + '/tensorboard_logs/' + name + '/',
        policy_kwargs={"net_arch": [256, 256]},
    )
    model.learn(total_timesteps=2000000, callback=checkpoint_callback)  # time steps of each agent; was 4 million
    mean_reward = evaluate_all_policies(name)
    tune.report(mean_reward=mean_reward)
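
# Connect to an already-running Ray cluster and launch the search:
# 100 trials, at most 10 in flight, each allocated 1 GPU and 5 CPUs.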
ray.init(address='auto')

analysis = tune.run(
    train,
    num_samples=100,
    search_alg=ConcurrencyLimiter(optuna_search, max_concurrent=10),
    verbose=2,
    resources_per_trial={"gpu": 1, "cpu": 5},
)
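
# A sketch of how the best trial could be read back afterwards (assumes the
# ExperimentAnalysis API of the Ray Tune versions this script targets):
# print(analysis.get_best_config(metric="mean_reward", mode="max"))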