from pygame.constants import K_DOWN, K_UP, K_LEFT, K_RIGHT
from pygame_player import PyGamePlayer
from skimage import data
from skimage.color import rgb2gray
from skimage.transform import rescale
from skimage import io, exposure, img_as_uint, img_as_float
from collections import deque
from random import randint
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random
import sys
import os
import time

import numpy as np
import theano
import theano.tensor as T

import lasagne

ACTIONS_COUNT = 5
STATE_FRAMES = 4
RESIZED_SCREEN_X, RESIZED_SCREEN_Y = 80, 80
MINI_BATCH_SIZE = 32
all_actions = [K_DOWN, K_UP, K_LEFT, K_RIGHT, []]
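# all_actions lists the pygame keys the agent can press, with the trailing empty
# list meaning "press no key"; the learned policy maps action indices to keys in
# SnakePlayer._key_presses_from_action rather than through this list.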

class SnakePlayer(PyGamePlayer):
    ACTIONS_COUNT = 5  # number of valid actions: up, down, left, right and "no key"
    FUTURE_REWARD_DISCOUNT = 0.99  # decay rate of past observations
    OBSERVATION_STEPS = 500.  # time steps to observe before training
    EXPLORE_STEPS = 2000000.  # frames over which to anneal epsilon
    INITIAL_RANDOM_ACTION_PROB = 1.0  # starting chance of an action being random
    FINAL_RANDOM_ACTION_PROB = 0.05  # final chance of an action being random
    MEMORY_SIZE = 590000  # number of observations to remember
    MINI_BATCH_SIZE = 32  # size of mini batches
    STATE_FRAMES = 4  # number of frames to store in the state
    RESIZED_SCREEN_X, RESIZED_SCREEN_Y = (80, 80)
    OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5)
    SAVE_EVERY_X_STEPS = 10000
    LEARN_RATE = 1e-6
    STORE_SCORES_LEN = 200.
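    # With these settings the chance of a random action falls linearly from
    # INITIAL_RANDOM_ACTION_PROB to FINAL_RANDOM_ACTION_PROB over EXPLORE_STEPS
    # frames, but only once OBSERVATION_STEPS transitions have been collected
    # (see get_keys_pressed below).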

    def __init__(self, force_game_fps=5, run_real_time=False, playback_mode=False):
        """
        Deep Q-learning player for the Snake game.
        """

        # Prepare Theano variables for inputs and targets
        input_var = T.tensor4('inputs')
        target_var = T.ivector('targets')
        states = T.tensor4('states')
        super(SnakePlayer, self).__init__(force_game_fps=force_game_fps, run_real_time=run_real_time)
        # self.last_score = 1
        self._output_layer = SnakePlayer.build_cnn(input_var)
        self._previous_observations = deque()  # note: unused; transitions are stored in self._observations below
        self._last_score = 1

        self._target = T.dscalar('target')

        # note: this stores a TensorType rather than a variable and is not used later
        self._action = T.TensorType('float64', (False,)*ACTIONS_COUNT)

        self._states_shared = theano.shared(
            np.ones((MINI_BATCH_SIZE, STATE_FRAMES, RESIZED_SCREEN_X, RESIZED_SCREEN_Y),
                    dtype=theano.config.floatX))

        # sanity check: run a dummy batch of ones through the untrained network
        readout = lasagne.layers.get_output(self._output_layer, self._states_shared)
        print(readout[0].eval())

        self._playback_mode = playback_mode
        self._observations = deque()
        self._last_scores = deque()
        # the first action defaults to index 1 (K_RIGHT in _key_presses_from_action)
        self._last_action = np.zeros(self.ACTIONS_COUNT)
        self._last_action[1] = 1

        self._last_state = None
        self._probability_of_random_action = self.INITIAL_RANDOM_ACTION_PROB
        self._time = 0

    def get_keys_pressed(self, screen_array, feedback, terminal):
        # shrink the screen capture by a factor of four and convert it to greyscale
        scr = rescale(screen_array, 0.25)
        scr = rgb2gray(scr)
        # note: `action` is picked at random here but never used below; the
        # commented-out return is a leftover from a purely random player
        action = all_actions[randint(0, 3)]
        # return [action]

        ###############################################
        reward = feedback
        if reward != 0.0:
            self._last_scores.append(reward)
            if len(self._last_scores) > self.STORE_SCORES_LEN:
                self._last_scores.popleft()

        # first frame must be handled differently
        if self._last_state is None:
            # the _last_state will contain the image data from the last self.STATE_FRAMES frames
            self._last_state = np.stack(tuple(scr for _ in range(self.STATE_FRAMES)), axis=2)
            return SnakePlayer._key_presses_from_action(self._last_action)

        scr = np.reshape(scr, (RESIZED_SCREEN_X, RESIZED_SCREEN_Y, 1))
        print(scr.shape)
        print(np.rollaxis(scr, 2, 0).shape)
        print(np.rollaxis(self._last_state, 2, 0).shape)
        current_state = np.append(scr, self._last_state[:, :, 1:], axis=2)

        print(current_state.shape)

        if not self._playback_mode:
            # store the transition in previous_observations
            self._observations.append((self._last_state, self._last_action, reward, current_state, terminal))

            if len(self._observations) > self.MEMORY_SIZE:
                self._observations.popleft()

            # only train if done observing
            if len(self._observations) > self.OBSERVATION_STEPS:
                print("start training...")
                self._train()
                self._time += 1

            print(len(self._observations))

        # update the old values
        self._last_state = current_state

        self._last_action = self._choose_next_action()

        if not self._playback_mode:
            # gradually reduce the probability of a random action
            if self._probability_of_random_action > self.FINAL_RANDOM_ACTION_PROB \
                    and len(self._observations) > self.OBSERVATION_STEPS:
                self._probability_of_random_action -= \
                    (self.INITIAL_RANDOM_ACTION_PROB - self.FINAL_RANDOM_ACTION_PROB) / self.EXPLORE_STEPS

            print("Time: %s random_action_prob: %s reward %s scores differential %s" %
                  (self._time, self._probability_of_random_action, reward,
                   sum(self._last_scores) / self.STORE_SCORES_LEN))

        return SnakePlayer._key_presses_from_action(self._last_action)

        ###############################################

    def get_feedback(self):
        # import must be done here because otherwise importing would cause the game to start playing
        from snake import snake
        score = snake.length
        # get the difference in score between this and the last run
        score_change = (score - self._last_score)
        self._last_score = score

        return float(score_change), False

    def _choose_next_action(self):
        new_action = np.zeros([self.ACTIONS_COUNT])

        if self._playback_mode or (random.random() <= self._probability_of_random_action):
            # choose an action randomly
            action_index = random.randrange(self.ACTIONS_COUNT)
        else:
            # choose an action given our last state
            # readout_t = self._session.run(self._output_layer, feed_dict={self._input_layer: [self._last_state]})[0]
            # move the frame axis to the front and add a batch dimension so the
            # state matches the network's (None, STATE_FRAMES, X, Y) input shape
            last_state = np.rollaxis(self._last_state, 2, 0)[np.newaxis, ...]
            readout_t = lasagne.layers.get_output(
                self._output_layer, last_state.astype(theano.config.floatX)).eval()
            action_index = np.argmax(readout_t)

        new_action[action_index] = 1
        print("****************************************** ACTION *******************************************")
        return new_action

    def _train(self):
        # Prepare Theano variables for inputs and targets
        input_var = T.tensor4('inputs')
        target_var = T.ivector('targets')
        states = T.tensor4('states')
        print("sampling mini batch...")
        # sample a mini_batch to train on
        mini_batch = random.sample(self._observations, self.MINI_BATCH_SIZE)
        # get the batch variables
        previous_states = [d[self.OBS_LAST_STATE_INDEX] for d in mini_batch]
        actions = [d[self.OBS_ACTION_INDEX] for d in mini_batch]
        rewards = [d[self.OBS_REWARD_INDEX] for d in mini_batch]
        current_states = np.array([d[self.OBS_CURRENT_STATE_INDEX] for d in mini_batch])
        agents_expected_reward = []
        # print np.rollaxis(current_states, 3, 1).shape
        print("compiling current states...")
        current_states = np.rollaxis(current_states, 3, 1)
        current_states = theano.compile.sharedvalue.shared(current_states)

        print("getting network output from current states...")
        agents_reward_per_action = lasagne.layers.get_output(self._output_layer, current_states)
        print(agents_reward_per_action.eval())

        print("rewards adding...")
        for i in range(len(mini_batch)):
            if mini_batch[i][self.OBS_TERMINAL_INDEX]:
                # this was a terminal frame, so there is no future reward to discount
                agents_expected_reward.append(rewards[i])
            else:
                agents_expected_reward.append(
                    rewards[i] + self.FUTURE_REWARD_DISCOUNT * np.max(agents_reward_per_action[i].eval()))

        print(len(agents_expected_reward))
        print(len(actions))
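        # The method stops here without updating any weights.  A minimal sketch
        # of the missing Q-learning update follows; the squared-error loss, the
        # one-hot action masking and the rmsprop optimiser are assumptions and
        # not part of the original code, and recompiling the theano function on
        # every call is done only for clarity (it would normally be compiled
        # once in __init__).
        previous_states = np.rollaxis(np.array(previous_states), 3, 1)
        previous_states = theano.compile.sharedvalue.shared(
            previous_states.astype(theano.config.floatX))
        actions_taken = theano.compile.sharedvalue.shared(
            np.array(actions, dtype=theano.config.floatX))
        targets = theano.compile.sharedvalue.shared(
            np.array(agents_expected_reward, dtype=theano.config.floatX))

        # Q-values predicted for the states we acted from, masked down to the
        # action that was actually taken in each transition
        q_values = lasagne.layers.get_output(self._output_layer, previous_states)
        q_taken = (q_values * actions_taken).sum(axis=1)

        # mean squared error between the predicted and the bootstrapped reward
        loss = T.mean(T.sqr(targets - q_taken))
        params = lasagne.layers.get_all_params(self._output_layer, trainable=True)
        updates = lasagne.updates.rmsprop(loss, params, learning_rate=self.LEARN_RATE)
        train_step = theano.function([], loss, updates=updates)
        print(train_step())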

    @staticmethod
    def build_cnn(input_var=None):
        # Convolutional Q-network: three conv layers followed by a dense layer
        # and a linear output per action, the same layer layout as the DeepMind
        # Atari DQN network.
        l_in = lasagne.layers.InputLayer(shape=(None, 4, 80, 80),
                                         input_var=input_var)

        l_conv1 = lasagne.layers.Conv2DLayer(
            l_in,
            num_filters=32,
            filter_size=(8, 8),
            stride=(4, 4),
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.HeUniform(),
            b=lasagne.init.Constant(.1)
        )

        l_conv2 = lasagne.layers.Conv2DLayer(
            l_conv1,
            num_filters=64,
            filter_size=(4, 4),
            stride=(2, 2),
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.HeUniform(),
            b=lasagne.init.Constant(.1)
        )

        l_conv3 = lasagne.layers.Conv2DLayer(
            l_conv2,
            num_filters=64,
            filter_size=(3, 3),
            stride=(1, 1),
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.HeUniform(),
            b=lasagne.init.Constant(.1)
        )

        l_hidden1 = lasagne.layers.DenseLayer(
            l_conv3,
            num_units=512,
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.HeUniform(),
            b=lasagne.init.Constant(.1)
        )

        # nonlinearity=None gives a linear output layer: one Q-value per action
        l_out = lasagne.layers.DenseLayer(
            l_hidden1,
            num_units=5,
            nonlinearity=None,
            W=lasagne.init.HeUniform(),
            b=lasagne.init.Constant(.1)
        )

        network_shape = lasagne.layers.get_output_shape(l_out)
        print(network_shape)
        network = l_out
        return network

    @staticmethod
    def _key_presses_from_action(action_set):
        # map the one-hot action vector to the pygame key(s) to press;
        # NB: index 3 returned K_UP (a duplicate of index 2) in the source,
        # which looks like a typo, so K_LEFT is assumed here
        if action_set[0] == 1:
            return [K_DOWN]
        elif action_set[1] == 1:
            return [K_RIGHT]
        elif action_set[2] == 1:
            return [K_UP]
        elif action_set[3] == 1:
            return [K_LEFT]
        elif action_set[4] == 1:
            return []
        raise Exception("Unexpected action")

if __name__ == '__main__':
    player = SnakePlayer()
    player.playing = True
    # importing snake will start the game playing
    import snake