import math

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

env = gym.make('MountainCarContinuous-v0')
env.seed(101)
np.random.seed(101)

print('observation space:', env.observation_space)
print('action space:', env.action_space)
print('  - low:', env.action_space.low)
print('  - high:', env.action_space.high)

class Agent(nn.Module):
    def __init__(self, env, h_size=16):
        super(Agent, self).__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)

    def set_weights(self, weights):
        s_size = self.s_size
        h_size = self.h_size
        a_size = self.a_size
        # slice the flat weight vector into per-layer weights and biases
        fc1_end = (s_size*h_size)+h_size
        fc1_W = torch.from_numpy(weights[:s_size*h_size].reshape(s_size, h_size))
        fc1_b = torch.from_numpy(weights[s_size*h_size:fc1_end])
        fc2_W = torch.from_numpy(weights[fc1_end:fc1_end+(h_size*a_size)].reshape(h_size, a_size))
        fc2_b = torch.from_numpy(weights[fc1_end+(h_size*a_size):])
        # copy the values into each layer (copy_ also casts float64 -> float32)
        self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data))
        self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data))
        self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data))
        self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data))

    def get_weights_dim(self):
        # total parameter count: (inputs + 1 bias) per unit, for both layers
        return (self.s_size+1)*self.h_size + (self.h_size+1)*self.a_size

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))  # F.tanh is deprecated; torch.tanh is equivalent
        return x.cpu().data

    def evaluate(self, weights, gamma=1.0, max_t=5000):
        # load a candidate weight vector, roll out one episode,
        # and return the discounted episode return
        self.set_weights(weights)
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            state = torch.from_numpy(state).float().to(device)
            action = self.forward(state)
            state, reward, done, _ = self.env.step(action.numpy())
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return

agent = Agent(env).to(device)
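
# A minimal sketch of how evaluate() and get_weights_dim() are typically
# driven by a cross-entropy method (CEM) loop. This loop is not part of the
# paste above: the function name and hyperparameters (n_iterations, pop_size,
# elite_frac, sigma) are illustrative assumptions.
def cem(agent, n_iterations=500, gamma=1.0, max_t=1000,
        pop_size=50, elite_frac=0.2, sigma=0.5):
    n_elite = int(pop_size * elite_frac)
    # start from a random mean weight vector
    best_weight = sigma * np.random.randn(agent.get_weights_dim())
    for i_iteration in range(1, n_iterations + 1):
        # sample a population of perturbed weight vectors around the current best
        weights_pop = [best_weight + sigma * np.random.randn(agent.get_weights_dim())
                       for _ in range(pop_size)]
        # score each candidate by the return of one rollout
        rewards = np.array([agent.evaluate(w, gamma, max_t) for w in weights_pop])
        # keep the elite candidates and average them into the new mean
        elite_idxs = rewards.argsort()[-n_elite:]
        best_weight = np.array([weights_pop[i] for i in elite_idxs]).mean(axis=0)
        if i_iteration % 10 == 0:
            print('Iteration {}\tBest return: {:.2f}'.format(i_iteration, rewards.max()))
    return best_weight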