---------------------------------------------------------------------------------------------------------------------------------------
PPO.PY:
---------------------------------------------------------------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import wandb


class PPOMemory:
    def __init__(self, batch_size, num_trajectories, num_steps_trajectory, state_size, device):
        # Rollout buffers of shape (num_trajectories, num_steps_trajectory[, state_size])
        self.states = torch.zeros((num_trajectories, num_steps_trajectory) + (state_size,)).to(device)
        self.actions = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.logprobs = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.values = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.rewards = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.next_states = torch.zeros((num_trajectories, num_steps_trajectory) + (state_size,)).to(device)
        self.dones = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.advantages = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.returns = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)

    def store(self, state, action, log_prob, value, reward, next_state, done, trajectory, step):
        # Write one transition into the rollout buffer at position (trajectory, step)
        self.states[trajectory][step] = state
        self.actions[trajectory][step] = action
        self.logprobs[trajectory][step] = log_prob
        self.values[trajectory][step] = value
        self.rewards[trajectory][step] = reward
        self.next_states[trajectory][step] = next_state
        self.dones[trajectory][step] = done

    def clear(self, num_trajectories, num_steps_trajectory, state_size, device):
        # Reset all buffers to zeros for the next rollout phase
        self.states = torch.zeros((num_trajectories, num_steps_trajectory) + (state_size,)).to(device)
        self.actions = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.logprobs = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.values = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.rewards = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.next_states = torch.zeros((num_trajectories, num_steps_trajectory) + (state_size,)).to(device)
        self.dones = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.advantages = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)
        self.returns = torch.zeros((num_trajectories, num_steps_trajectory)).to(device)

    def flatten(self, state_size):
        # Flatten (num_trajectories, num_steps_trajectory, ...) into (batch_size, ...)
        states_flat = self.states.reshape((-1,) + (state_size,))
        actions_flat = self.actions.reshape((-1,))
        logprobs_flat = self.logprobs.reshape((-1,))
        advantages_flat = self.advantages.reshape((-1,))
        returns_flat = self.returns.reshape((-1,))
        values_flat = self.values.reshape((-1,))
        return states_flat, actions_flat, logprobs_flat, advantages_flat, returns_flat, values_flat

    def generate_batches(self, batch_size, minibatch_size):
        # Shuffle the flat batch indices and split them into minibatches
        indices = np.arange(batch_size)
        batch_start_ind = np.arange(0, batch_size, minibatch_size)
        np.random.shuffle(indices)
        batches = [indices[i:i + minibatch_size] for i in batch_start_ind]
        return batches
class BuildNetwork(nn.Module):
    def __init__(self, nn_architecture, input_size, output_size, learning_rate, device):
        super().__init__()
        # nn_architecture is a list of (layer_size, activation) tuples describing the hidden layers
        layers = []
        in_size = input_size
        for layer_size, activation in nn_architecture:
            # Add the hidden layer using nn.Linear, followed by its activation function
            layers.append(nn.Linear(in_size, layer_size))
            if activation.lower() == "relu":
                layers.append(nn.ReLU())
            elif activation.lower() == "sigmoid":
                layers.append(nn.Sigmoid())
            elif activation.lower() == "tanh":
                layers.append(nn.Tanh())
            elif activation.lower() == "linear":
                pass
            else:
                raise ValueError(f"Unsupported activation function: {activation}")
            in_size = layer_size
        # Add the output layer using nn.Linear (left unactivated)
        layers.append(nn.Linear(in_size, output_size))
        # Create the network from the collected layers
        self.model = nn.Sequential(*layers)
        # Set optimizer
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        # Move model to device
        self.to(device)

    def forward(self, state):
        # Return the raw (unactivated) outputs for a given state
        output = self.model(state)
        return output
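# Illustrative example (values assumed, not taken from the original code): nn_architecture = [(64, "tanh"), (64, "tanh")] with
# input_size = 159 and output_size = 5 builds Linear(159, 64) -> Tanh -> Linear(64, 64) -> Tanh -> Linear(64, 5), i.e. each
# (size, activation) entry adds one hidden layer followed by its activation, and the final output layer stays unactivated so
# logits (actor) or values (critic) come out raw.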
class PPO_Agent:
    def __init__(self, env, config, device):
        self.device = device
        self.env = env
        # General settings read from the config dictionary (an illustrative example config is sketched after this file)
        self.state_size = config["state_size"]
        self.num_actions = config["num_actions"]
        self.num_episodes_train = config["num_episodes_train"]
        self.num_episodes_validate = config["num_episodes_validate"]
        self.num_episodes_test = config["num_episodes_test"]
        self.num_trajectories = config["num_trajectories"]
        self.num_steps_trajectory = config["num_steps_trajectory"]
        self.num_epochs = config["num_epochs"]
        self.batch_size = self.num_trajectories * self.num_steps_trajectory
        self.num_minibatches = config["num_minibatches"]
        self.minibatch_size = self.batch_size // self.num_minibatches
        # Hyperparameters
        self.nn_architecture_actor = config["nn_architecture_actor"]
        self.nn_architecture_critic = config["nn_architecture_critic"]
        self.learning_rate_actor = config["learning_rate_actor"]
        self.learning_rate_critic = config["learning_rate_critic"]
        self.gamma = config["gamma"]
        self.lam = config["lam"]
        self.clip_ratio = config["clip_ratio"]
        self.entropy_coef = config["entropy_coef"]
        self.value_coef = config["value_coef"]
        self.entropy_coef_min = config["entropy_coef_min"]
        self.entropy_coef_decay = config["entropy_coef_decay"]
        self.max_grad_norm = config["max_grad_norm"]
        # Memory initialization
        self.memory = PPOMemory(self.batch_size, self.num_trajectories, self.num_steps_trajectory, self.state_size, self.device)
        # Network initialization
        self.actor = BuildNetwork(self.nn_architecture_actor, self.state_size, self.num_actions, self.learning_rate_actor, self.device)
        self.critic = BuildNetwork(self.nn_architecture_critic, self.state_size, 1, self.learning_rate_critic, self.device)

    def load_weights(self, path):
        self.actor.load_state_dict(torch.load(path))
        self.actor.eval()

    def save_weights(self, path):
        torch.save(self.actor.state_dict(), path)

    def get_action(self, state):
        # Output logits given a state and sample an action from the resulting categorical policy
        logits = self.actor(state)
        dist = Categorical(logits=logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action, log_prob

    def get_value(self, state):
        # Output the critic's value estimate given a state
        value = self.critic(state)
        return value
    def calc_advantage(self, values, next_states, rewards, dones, device):
        # Generalized Advantage Estimation (GAE):
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        #   A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
        with torch.no_grad():
            # Initialize advantages and returns
            advantages = torch.zeros_like(values).to(device)
            returns = torch.zeros_like(values).to(device)
            # Running advantage accumulator (one entry per trajectory)
            advantage = torch.zeros(self.num_trajectories).to(device)
            # Iterate in reverse over time steps
            for t in reversed(range(self.num_steps_trajectory)):
                # Mask out bootstrapping across terminal states
                mask = 1.0 - dones[:, t]
                if t == self.num_steps_trajectory - 1:
                    # Bootstrap from the critic for the last step of each trajectory
                    next_values = self.get_value(next_states[:, t])
                else:
                    next_values = values[:, t + 1]
                # Compute delta / TD error
                delta = rewards[:, t] + self.gamma * next_values.view(-1) * mask - values[:, t]
                # Recursive GAE update
                advantage = delta + self.gamma * self.lam * mask * advantage
                advantages[:, t] = advantage
                # Return = advantage + value baseline
                returns[:, t] = advantage + values[:, t]
        return advantages, returns
    def compute_loss(self, states_mb, actions_mb, old_logprobs_mb, advantages_norm_mb, returns_mb):
        # Probability ratio r_t = exp(log pi_new(a|s) - log pi_old(a|s))
        logits = self.actor(states_mb)
        dist = Categorical(logits=logits)
        new_logprobs_mb = dist.log_prob(actions_mb)
        prob_ratio = (new_logprobs_mb - old_logprobs_mb).exp()
        clipped_prob_ratio = torch.clamp(prob_ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
        # Entropy bonus
        entropy = dist.entropy()
        entropy_loss = entropy.mean()
        # Clipped PPO surrogate objective: -E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)]
        actor_loss = -(torch.min(advantages_norm_mb * prob_ratio, advantages_norm_mb * clipped_prob_ratio).mean())
        # Value (critic) loss: mean squared error against the returns
        critic_values = self.critic(states_mb).view(-1)
        critic_loss = ((critic_values - returns_mb) ** 2).mean()
        # Total loss
        total_loss = actor_loss + self.value_coef * critic_loss - self.entropy_coef * entropy_loss
        return total_loss, actor_loss, critic_loss, entropy_loss
    def update_networks(self):
        # Compute advantages and returns from the rollout data
        advantages, returns = self.calc_advantage(self.memory.values, self.memory.next_states, self.memory.rewards, self.memory.dones, self.device)
        self.memory.advantages = advantages
        self.memory.returns = returns
        # Flatten memory into one batch
        states, actions, old_logprobs, advantages, returns, values = self.memory.flatten(self.state_size)
        for epoch in range(self.num_epochs):
            # Generate shuffled minibatches
            batches = self.memory.generate_batches(self.batch_size, self.minibatch_size)
            for minibatch in batches:
                # States, log-probs, actions, returns and values for the current minibatch
                states_mb = states[minibatch]
                old_logprobs_mb = old_logprobs[minibatch]
                actions_mb = actions[minibatch]
                returns_mb = returns[minibatch]
                values_mb = values[minibatch]
                # Advantage normalization within the current minibatch
                advantages_mb = advantages[minibatch]
                advantages_norm_mb = (advantages_mb - advantages_mb.mean()) / (advantages_mb.std() + 1e-8)
                total_loss, actor_loss, critic_loss, entropy_loss = self.compute_loss(states_mb, actions_mb, old_logprobs_mb, advantages_norm_mb, returns_mb)
                # Perform one gradient step on both networks
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
                nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
                self.actor.optimizer.step()
                self.critic.optimizer.step()
        return total_loss, actor_loss, critic_loss, entropy_loss
    def run(self, mode, num_episodes):
        rewards = []
        training = mode == "training"
        total_loss = 0
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        for episode in range(num_episodes):
            # Initialize the episodic reward to zero
            reward_episode = 0
            # Keep track of the trajectory index within the current policy rollout
            trajectory = episode % self.num_trajectories
            # Reset environment
            state = torch.tensor(self.env.reset(mode), dtype=torch.float32).to(self.device)
            # Perform one trajectory
            for step in range(self.num_steps_trajectory):
                # Perform one environment step
                with torch.no_grad():
                    action, log_prob = self.get_action(state)
                    value = self.get_value(state).squeeze()
                reward, next_state, done = self.env.step(action)
                reward_episode += reward
                # Transform to tensors
                reward = torch.tensor([reward]).to(self.device)
                next_state = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                done = torch.tensor([done]).to(self.device)
                # Keep track of rollout data in case of training
                if training:
                    self.memory.store(state, action, log_prob, value, reward, next_state, done, trajectory, step)
                # Update state for next step
                state = next_state
            # Append episodic reward
            rewards.append(reward_episode)
            average_reward = np.mean(rewards)
            # Perform the update step once the rollout is full, then clear memory for the next rollout phase
            if training and trajectory == self.num_trajectories - 1:
                total_loss, actor_loss, critic_loss, entropy_loss = self.update_networks()
                self.memory.clear(self.num_trajectories, self.num_steps_trajectory, self.state_size, self.device)
            # Log relevant data
            wandb.log({"Charts/Average Reward": average_reward}, step=episode)
            wandb.log({"Charts/Reward per Episode": reward_episode}, step=episode)
            wandb.log({"Losses/Total Loss": total_loss}, step=episode)
            wandb.log({"Losses/Actor Loss": actor_loss}, step=episode)
            wandb.log({"Losses/Critic Loss": critic_loss}, step=episode)
            wandb.log({"Losses/Entropy": entropy_loss}, step=episode)
            print(f"EPISODE: {episode + 1} / {num_episodes}, Total Reward: {reward_episode}")
        return rewards
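---------------------------------------------------------------------------------------------------------------------------------------
EXAMPLE CONFIG (ILLUSTRATIVE):
---------------------------------------------------------------------------------------------------------------------------------------
A hypothetical config dictionary for PPO_Agent: the keys are exactly the ones PPO_Agent.__init__ reads, but every value (architectures,
learning rates, coefficients, episode and rollout counts) is an illustrative placeholder, not a tuned setting from the original code.

config = {
    "state_size": 159,                  # should match Environment.get_state_size()
    "num_actions": 5,                   # 0 nothing, 1 up, 2 right, 3 down, 4 left
    "num_episodes_train": 5000,
    "num_episodes_validate": 100,
    "num_episodes_test": 100,
    "num_trajectories": 10,             # trajectories collected per policy rollout
    "num_steps_trajectory": 200,        # matches Environment.episode_steps (200)
    "num_epochs": 4,
    "num_minibatches": 4,               # ideally divides batch_size = num_trajectories * num_steps_trajectory evenly
    "nn_architecture_actor": [(128, "tanh"), (128, "tanh")],
    "nn_architecture_critic": [(128, "tanh"), (128, "tanh")],
    "learning_rate_actor": 3e-4,
    "learning_rate_critic": 1e-3,
    "gamma": 0.99,
    "lam": 0.95,
    "clip_ratio": 0.2,
    "entropy_coef": 0.01,
    "value_coef": 0.5,
    "entropy_coef_min": 0.001,          # read by __init__ but not used in the code above
    "entropy_coef_decay": 0.999,        # read by __init__ but not used in the code above
    "max_grad_norm": 0.5,
}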
---------------------------------------------------------------------------------------------------------------------------------------
ENVIRONMENT.PY:
---------------------------------------------------------------------------------------------------------------------------------------
# actions: 0 (nothing), 1 (up), 2 (right), 3 (down), 4 (left)
# positions in grid:
# - (0,0) is the upper left corner
# - the first index is vertical (increasing from top to bottom)
# - the second index is horizontal (increasing from left to right)
# if a new item appears in a cell into which the agent moves / in which the agent stays during the same time step,
# it is not picked up (if the agent wants to pick it up, it has to stay in that cell in the next time step)
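# Worked example of the conventions above (illustrative): from the target cell (2, 0), action 1 (up) leads to (1, 0) and action 3
# (down) leads to (3, 0); an item that appears in (3, 0) during the same step in which the agent moves there is only picked up if
# the agent stays in (3, 0) for the following step. manhatten_dist((2, 0), (0, 4)) = |2 - 0| + |0 - 4| = 6.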
import random
from typing import List, Tuple
import pandas as pd
from copy import deepcopy
from itertools import compress
import numpy as np


# TODO: delete / move
def manhatten_dist(pos: Tuple[int, int], tgt: Tuple[int, int]) -> int:
    # Manhattan (L1) distance between two grid cells
    return abs(pos[0] - tgt[0]) + abs(pos[1] - tgt[1])
class Environment(object):
    def __init__(self, variant, data_dir):
        self.variant = variant
        self.vertical_cell_count = 5
        self.horizontal_cell_count = 5
        self.vertical_idx_target = 2
        self.horizontal_idx_target = 0
        self.target_loc = (self.vertical_idx_target, self.horizontal_idx_target)
        self.episode_steps = 200
        self.max_response_time = 15 if self.variant == 2 else 10
        self.reward = 25 if self.variant == 2 else 15
        self.data_dir = data_dir
        self.training_episodes = pd.read_csv(self.data_dir + f"/variant_{self.variant}/training_episodes.csv")
        self.training_episodes = self.training_episodes.training_episodes.tolist()
        self.validation_episodes = pd.read_csv(self.data_dir + f"/variant_{self.variant}/validation_episodes.csv")
        self.validation_episodes = self.validation_episodes.validation_episodes.tolist()
        self.test_episodes = pd.read_csv(self.data_dir + f"/variant_{self.variant}/test_episodes.csv")
        self.test_episodes = self.test_episodes.test_episodes.tolist()
        self.remaining_training_episodes = deepcopy(self.training_episodes)
        self.validation_episode_counter = 0
        if self.variant == 0 or self.variant == 2:
            self.agent_capacity = 1
        else:
            self.agent_capacity = 3
        if self.variant == 0 or self.variant == 1:
            self.eligible_cells = [(0,0), (0,1), (0,2), (0,3), (0,4),
                                   (1,0), (1,1), (1,2), (1,3), (1,4),
                                   (2,0), (2,1), (2,2), (2,3), (2,4),
                                   (3,0), (3,1), (3,2), (3,3), (3,4),
                                   (4,0), (4,1), (4,2), (4,3), (4,4)]
        else:
            self.eligible_cells = [(0,0), (0,2), (0,3), (0,4),
                                   (1,0), (1,2), (1,4),
                                   (2,0), (2,2), (2,4),
                                   (3,0), (3,1), (3,2), (3,4),
                                   (4,0), (4,1), (4,2), (4,4)]
        self.current_episode_target_count = 0  # Counts the number of items dropped off at the target cell
    # initialize a new episode (specify if training, validation, or testing via the mode argument)
    def reset(self, mode):
        modes = ["training", "validation", "testing"]
        if mode not in modes:
            raise ValueError("Invalid mode. Expected one of: %s" % modes)
        self.step_count = 0
        self.agent_loc = (self.vertical_idx_target, self.horizontal_idx_target)
        self.agent_load = 0  # number of items loaded (0 or 1, except for the first extension, where it can be 0, 1, 2, 3)
        self.item_locs = []
        self.item_times = []
        # self.item_distances = []
        if mode == "testing":
            episode = self.test_episodes[0]
            self.test_episodes.remove(episode)
        elif mode == "validation":
            episode = self.validation_episodes[self.validation_episode_counter]
            self.validation_episode_counter = (self.validation_episode_counter + 1) % 100
        else:
            if not self.remaining_training_episodes:
                self.remaining_training_episodes = deepcopy(self.training_episodes)
            episode = random.choice(self.remaining_training_episodes)
            self.remaining_training_episodes.remove(episode)
        self.data = pd.read_csv(self.data_dir + f"/variant_{self.variant}/episode_data/episode_{episode:03d}.csv", index_col=0)
        # For CNN: per-cell counts of how many items have spawned so far in the episode
        self.past_items = np.zeros((self.vertical_cell_count, self.horizontal_cell_count), dtype=int)
        return self.get_obs()
    # take one environment step based on the action act
    def step(self, act):
        self.step_count += 1
        rew = 0
        # done signal (1 if the episode ends, 0 if not)
        if self.step_count == self.episode_steps:
            done = 1
        else:
            done = 0
        # MP
        new_loc = self.agent_loc
        # agent movement
        if act != 0:
            if act == 1:  # up
                new_loc = (self.agent_loc[0] - 1, self.agent_loc[1])
            elif act == 2:  # right
                new_loc = (self.agent_loc[0], self.agent_loc[1] + 1)
            elif act == 3:  # down
                new_loc = (self.agent_loc[0] + 1, self.agent_loc[1])
            elif act == 4:  # left
                new_loc = (self.agent_loc[0], self.agent_loc[1] - 1)
            if new_loc in self.eligible_cells:
                self.agent_loc = new_loc
            rew += -1
        # item pick-up
        if (self.agent_load < self.agent_capacity) and (self.agent_loc in self.item_locs):
            self.agent_load += 1
            idx = self.item_locs.index(self.agent_loc)
            self.item_locs.pop(idx)
            self.item_times.pop(idx)
            rew += self.reward / 2
        # item drop-off
        if self.agent_loc == self.target_loc:
            rew += self.agent_load * self.reward / 2
            self.current_episode_target_count += self.agent_load
            self.agent_load = 0
        # track how long ago items appeared
        self.item_times = [i + 1 for i in self.item_times]
        # remove items for which the max response time is reached
        mask = [i < self.max_response_time for i in self.item_times]
        self.item_locs = list(compress(self.item_locs, mask))
        self.item_times = list(compress(self.item_times, mask))
        # add items which appear in the current time step
        new_items = self.data[self.data.step == self.step_count]
        new_items = list(zip(new_items.vertical_idx, new_items.horizontal_idx))
        new_items = [i for i in new_items if i not in self.item_locs]  # not more than one item per cell
        self.item_locs += new_items
        self.item_times += [0] * len(new_items)
        # For CNN: update per-cell spawn counts
        for loc in new_items:
            self.past_items[loc] += 1
        # get new observation
        next_obs = self.get_obs()
        return rew, next_obs, done
    def get_state_size(self):
        # Size of the flat observation vector (note: calls reset and therefore consumes one training episode)
        size = self.reset("training").shape[0]
        return size
    def get_obs(self) -> List[float]:
        grid_shape = (self.vertical_cell_count, self.horizontal_cell_count)
        # Agent position (one-hot grid)
        agent_position = np.zeros(grid_shape)
        agent_position[self.agent_loc] = 1
        # Item positions
        item_positions = np.zeros(grid_shape)
        for loc in self.item_locs:
            item_positions[loc] = 1
        # Remaining response times of items (normalized)
        remaining_times = np.zeros(grid_shape)
        for loc, time in zip(self.item_locs, self.item_times):
            remaining_times[loc] = (self.max_response_time - time) / self.max_response_time
        # Past items (normalized spawn counts per cell); computed here but not appended to the state vector below
        past_items_vector = self.past_items.flatten()
        total_spawns = past_items_vector.sum()
        if total_spawns > 0:
            past_items_vector = past_items_vector / total_spawns
        else:
            past_items_vector = np.zeros_like(past_items_vector)
        # Target location
        target_location = np.zeros(grid_shape)
        target_location[self.target_loc] = 1
        # Free capacity
        free_capacity = self.agent_capacity - self.agent_load
        # Manhattan distances to items and data for the closest item
        distance_to_closest_item = 1000
        remaining_time_closest_item = 1000
        distance_closest_item_to_target = 1000
        manhattan_distance_items = np.zeros(grid_shape)
        for loc in self.item_locs:
            manhattan_distance_items[loc] = manhatten_dist(self.agent_loc, loc)
            if manhattan_distance_items[loc] < distance_to_closest_item:
                distance_to_closest_item = manhattan_distance_items[loc]
                remaining_time_closest_item = remaining_times[loc]
                distance_closest_item_to_target = manhatten_dist(loc, self.target_loc)
        # Manhattan distance to the target
        manhattan_distance_target = np.zeros(grid_shape)
        manhattan_distance_target[self.target_loc] = manhatten_dist(self.agent_loc, self.target_loc)
        # Distances to the four walls (top, bottom, left, right)
        distance_to_walls = [
            self.agent_loc[0],
            self.vertical_cell_count - 1 - self.agent_loc[0],
            self.agent_loc[1],
            self.horizontal_cell_count - 1 - self.agent_loc[1]
        ]
        # Current number of items on the field
        num_items = len(self.item_locs)
        state = np.concatenate([
            np.array(agent_position).flatten(),                    # Dim 25
            np.array(item_positions).flatten(),                    # Dim 25
            np.array(remaining_times).flatten(),                   # Dim 25
            np.array(target_location).flatten(),                   # Dim 25
            np.array(free_capacity).flatten(),                     # Dim 1
            np.array(manhattan_distance_items).flatten(),          # Dim 25
            np.array(manhattan_distance_target).flatten(),         # Dim 25
            np.array(distance_to_closest_item).flatten(),          # Dim 1
            np.array(remaining_time_closest_item).flatten(),       # Dim 1
            np.array(distance_closest_item_to_target).flatten(),   # Dim 1
            np.array(distance_to_walls).flatten(),                 # Dim 4
            np.array(num_items).flatten(),                         # Dim 1
        ])
        return state
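---------------------------------------------------------------------------------------------------------------------------------------
USAGE SKETCH:
---------------------------------------------------------------------------------------------------------------------------------------
A minimal, hypothetical entry point showing how the two files above fit together, assuming they are saved as ppo.py and environment.py,
that the example config dictionary shown after PPO.PY is defined as config, and that the episode CSV files exist under the chosen
data_dir; the variant, data directory, wandb project name, and weights path below are placeholders, not values from the original code.

import torch
import wandb

from environment import Environment
from ppo import PPO_Agent

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Environment for variant 0: capacity 1, full 5x5 grid, data read from data_dir/variant_0/
    env = Environment(variant=0, data_dir="data")
    # The state size has to match the observation built in get_obs(); get_state_size() consumes one training episode
    config["state_size"] = env.get_state_size()
    config["num_actions"] = 5  # nothing, up, right, down, left
    # run() logs to wandb, so a run has to be initialized first
    wandb.init(project="grid-pickup-ppo", config=config)
    agent = PPO_Agent(env, config, device)
    # Train, save the actor weights, then run a validation pass
    agent.run("training", config["num_episodes_train"])
    agent.save_weights("actor_weights.pt")
    agent.run("validation", config["num_episodes_validate"])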