# Set up training process.
from collections import deque

import numpy as np
import torch
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent_a2c = A2CModel().to(device)
optimizer = optim.Adam(agent_a2c.parameters(), lr=0.00015)

# Reset the environment and grab the initial observations for all agents.
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations
init_states = states

n_episodes = 1
n_steps = 10                               # rollout length per update
episode_end = False
a2c_ep_rewards_list = []                   # per-episode average scores
ep_rewards_deque = deque([0], maxlen=100)  # initialize with 0
ep_rewards = 0

while True:
    # Roll out n_steps with the current policy and build the training batch.
    batch_s, batch_a, batch_v_t, accu_rewards, init_states, episode_end = collect_trajectories(
        agent_a2c, env, brain_name, init_states, episode_end, n_steps)
    # One actor-critic gradient update on the collected batch.
    loss, mus, stds = learn(batch_s, batch_a, batch_v_t, agent_a2c, optimizer)
    ep_rewards += accu_rewards
    print('\rEpisode {:>4}\tEpisodic Score {:>7.3f}\tLoss {:>12.6f}'.format(
        n_episodes, np.mean(ep_rewards_deque), float(loss)), end="")
    if episode_end:
        if n_episodes % 100 == 0:
            print('\rEpisode {:>4}\tEpisodic Score {:>7.3f}\tLoss {:>12.6f}'.format(
                n_episodes, np.mean(ep_rewards_deque), float(loss)))
        # Stop once the 100-episode moving average reaches the target score.
        if np.mean(ep_rewards_deque) >= 34:
            break
        a2c_ep_rewards_list.append(ep_rewards / num_agents)
        ep_rewards_deque.append(ep_rewards / num_agents)
        ep_rewards = 0
        n_episodes += 1
        episode_end = False

# Save the trained A2C model and the learning curve.
pth = './checkpoint/a2c_checkpoint.pth'
torch.save(agent_a2c.state_dict(), pth)
a2c_ep_rewards_list = np.array(a2c_ep_rewards_list)
np.save('./data/a2c_ep_rewards_list.npy', a2c_ep_rewards_list)
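
# To reload the saved weights later (standard PyTorch usage):
#   agent_a2c = A2CModel().to(device)
#   agent_a2c.load_state_dict(torch.load(pth, map_location=device))

# ---------------------------------------------------------------------------
# The loop above assumes that A2CModel, collect_trajectories, learn, env,
# brain_name and num_agents were defined earlier (env/brain_name/num_agents
# match the Unity ML-Agents API). Below is a minimal sketch of what those
# helpers could look like for a Reacher-style task (33-dim states, 4-dim
# continuous actions in [-1, 1]); it is an illustration under those
# assumptions, not the original implementation.
# ---------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F

class A2CModel(nn.Module):
    """Shared-body actor-critic: Gaussian policy head plus a value head."""
    def __init__(self, state_size=33, action_size=4, hidden=128):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.mu = nn.Linear(hidden, action_size)               # policy mean
        self.log_std = nn.Parameter(torch.zeros(action_size))  # state-independent log-std
        self.v = nn.Linear(hidden, 1)                          # state value

    def forward(self, states):
        x = self.body(states)
        return torch.tanh(self.mu(x)), self.log_std.exp(), self.v(x)

def collect_trajectories(model, env, brain_name, init_states, episode_end,
                         n_steps, gamma=0.99):
    """Roll out up to n_steps, returning flattened states, actions, and
    bootstrapped n-step returns, plus the reward accumulated this rollout."""
    device = next(model.parameters()).device
    states = init_states
    s_list, a_list, r_list, d_list = [], [], [], []
    accu_rewards = 0.0
    for _ in range(n_steps):
        s = torch.from_numpy(np.asarray(states)).float().to(device)
        with torch.no_grad():
            mu, std, _ = model(s)
        actions = torch.distributions.Normal(mu, std).sample().clamp(-1, 1)
        env_info = env.step(actions.cpu().numpy())[brain_name]
        rewards = np.asarray(env_info.rewards)
        dones = np.asarray(env_info.local_done, dtype=np.float32)
        s_list.append(s)
        a_list.append(actions)
        r_list.append(rewards)
        d_list.append(dones)
        accu_rewards += rewards.sum()
        states = env_info.vector_observations
        if dones.any():  # episode finished: reset so the caller gets fresh states
            episode_end = True
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            break
    # Bootstrap the return of the last state with the critic, then back up.
    with torch.no_grad():
        _, _, v_last = model(torch.from_numpy(np.asarray(states)).float().to(device))
    R = v_last.squeeze(-1).cpu().numpy()
    v_t_list = []
    for rewards, dones in zip(reversed(r_list), reversed(d_list)):
        R = rewards + gamma * R * (1.0 - dones)
        v_t_list.append(R.copy())
    v_t_list.reverse()
    batch_s = torch.cat(s_list)
    batch_a = torch.cat(a_list)
    batch_v_t = torch.from_numpy(np.concatenate(v_t_list)).float().to(device)
    return batch_s, batch_a, batch_v_t, accu_rewards, states, episode_end

def learn(batch_s, batch_a, batch_v_t, model, optimizer, beta=0.01):
    """One A2C update: policy gradient weighted by the advantage, value
    regression toward the n-step returns, and an entropy bonus."""
    mus, stds, values = model(batch_s)
    values = values.squeeze(-1)
    dist = torch.distributions.Normal(mus, stds)
    log_probs = dist.log_prob(batch_a).sum(dim=-1)
    advantages = (batch_v_t - values).detach()
    policy_loss = -(log_probs * advantages).mean()
    value_loss = F.mse_loss(values, batch_v_t)
    entropy = dist.entropy().sum(dim=-1).mean()
    loss = policy_loss + 0.5 * value_loss - beta * entropy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.detach(), mus.detach(), stds.detach()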