#======================================================================
# DDQN in Grid World
# Author : Junmo Nam
# Reference : Van Hasselt, Hado, Arthur Guez, and David Silver.
# "Deep reinforcement learning with double q-learning."
# Thirtieth AAAI Conference on Artificial Intelligence. 2016.
# BEGAS
#======================================================================

#required packages
library(keras)    #model building and training
library(dplyr)    #data manipulation
library(reshape2) #melt() for reshaping the q table
library(ggplot2)  #policy visualization

#======================================================================
# Environment setting
#======================================================================
#step function : return the next state and reward after taking an action
step = function(state,action){

  #x and y in state
  x = state$x
  y = state$y

  #do action
  if(x==1 & action == 'up' | x==nrow(grid) & action=='down' |
     y==1 & action == 'left' | y==ncol(grid) & action == 'right'){
    #action not allowed (would leave the grid) : penalty
    reward = -1.5
    next_state = state
  }else if(action == 'up'){
    reward = grid[x-1,y]
    next_state = list(x = x-1,y = y)
  }else if(action == 'down'){
    reward = grid[x+1,y]
    next_state = list(x = x+1,y = y)
  }else if(action == 'left'){
    reward = grid[x,y-1]
    next_state = list(x = x,y = y-1)
  }else{
    reward = grid[x,y+1]
    next_state = list(x = x,y = y+1)
  }

  if(reward==-1){ #hit a wall : go back
    reward = -1.75
    next_state = state
  }

  return(list(next_state = next_state,reward = reward))
}
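
#Illustrative usage (a minimal sketch, not part of the original script):
#step() reads a global matrix `grid` in which -1 marks a wall, 1 the
#destination and 0 an empty cell. The 4x4 grid below is a made-up example.
# grid = matrix(0, nrow = 4, ncol = 4)
# grid[2,3] = -1                       #wall
# grid[4,4] = 1                        #destination
# step(list(x = 1, y = 1), 'right')    #legal move : reward = grid[1,2] = 0
# step(list(x = 1, y = 1), 'up')       #off-grid move : reward = -1.5, state unchanged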

#epsilon-greedy action selection for a given state
get_action = function(state,epsilon,qtable,actions){

  ifelse(sample(0:1,1,prob = c(epsilon,1-epsilon))==1, #epsilon check
         as.character((qtable %>% filter(x==state$x,y==state$y) %>%
                         filter(q == max(q)) %>% sample_n(1))$action), #greedy : max-q action, ties broken at random
         sample(actions,1)) #random action
}
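
#Illustrative usage (assumed values, not part of the original script):
#with epsilon = 0.1 the greedy (max-q) action is taken about 90% of the
#time. The q table layout mirrors the one built inside double_dqn_agent().
# actions = c('up','down','left','right')
# qtable = expand.grid(x = 1:4, y = 1:4, action = actions, q = 0)
# get_action(list(x = 1, y = 1), epsilon = 0.1, qtable, actions)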


#model-building function for the q network (DQN)
build_model = function(state_size, action_size,learning_rate){
  model <- keras_model_sequential() %>%
    # 1st layer
    layer_dense(input_shape = state_size,30,kernel_initializer = 'he_uniform',activation = 'relu') %>%
    # 2nd layer
    layer_dense(30, kernel_initializer = 'he_uniform',activation = 'relu') %>%
    # 3rd layer : output
    layer_dense(action_size,activation = 'linear') %>%
    #compile model
    compile(optimizer = optimizer_adam(lr = learning_rate),
            loss = 'mse')
}
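
#Illustrative usage (a minimal sketch, assuming keras and a TensorFlow
#backend are installed): the state is (x, y), so state_size = 2, and the
#four actions give action_size = 4.
# m = build_model(state_size = 2, action_size = 4, learning_rate = 0.001)
# summary(m)   #two hidden layers of 30 units, linear output of size 4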

#training function : DDQN
double_trainer = function(model,memory,target_model,batch_size,discount,qtable){

  #draw a sample (mini-batch) from memory
  mini_batch = memory %>% sample_n(batch_size)

  #extract values : S, A, R, S'
  states = mini_batch[,1:2] #state vars
  names(states) = c('x','y')
  actions = mini_batch %>% select(action)
  rewards = mini_batch %>% select(reward)
  next_states = mini_batch %>% select(grep('next_state',names(.)))
  names(next_states) = c('x','y')

  #get dones
  dones = mini_batch %>% select(done)

  #make q(s) and q(s')
  target = predict(model,data.matrix(states))
  target_val = predict(target_model,data.matrix(next_states))

  #Bellman equation for updating the target
  for(i in 1:batch_size){
    ifelse(dones[i,],
           target[i,which(c('up','down','left','right')==actions[i,])] <- rewards[i,], #when the episode is done, only the reward ; column index matches the action
           #update the target by the Bellman equation, column index matches the action
           #DDQN method : R_t+1 + discount*Q(s', argmax Q'(s'))
           target[i,which(c('up','down','left','right')==actions[i,])] <- rewards[i,] +
             discount*(filter(qtable,x==next_states[i,1],y==next_states[i,2],
                              action ==c('up','down','left','right')[which.max(target_val[i,])]) %>% sample_n(1))$q
    )
  }
  return(list(states_batch = data.matrix(states),target = target)) #batch components for fitting the model
}
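
#For reference (from the cited Van Hasselt et al. paper), the two targets are:
#  DQN  : y = r + discount * max_a Q_target(s', a)
#  DDQN : y = r + discount * Q(s', argmax_a Q'(s', a)), i.e. one network
#         selects the next action and the other evaluates it. In
#         double_trainer() the argmax is taken over target_val (target model)
#         and the value is looked up in qtable, which is built from the
#         online model's predictions.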

#training function : DQN
single_trainer = function(model,memory,target_model,batch_size,discount,qtable){

  #draw a sample (mini-batch) from memory
  mini_batch = memory %>% sample_n(batch_size)

  #extract values : S, A, R, S'
  states = mini_batch[,1:2] #state vars
  names(states) = c('x','y')
  actions = mini_batch %>% select(action)
  rewards = mini_batch %>% select(reward)
  next_states = mini_batch %>% select(grep('next_state',names(.)))
  names(next_states) = c('x','y')

  #get dones
  dones = mini_batch %>% select(done)

  #make q(s) and q(s')
  target = predict(model,data.matrix(states))
  target_val = predict(target_model,data.matrix(next_states))

  #Bellman equation for updating the target
  for(i in 1:batch_size){
    ifelse(dones[i,],
           target[i,which(c('up','down','left','right')==actions[i,])] <- rewards[i,], #when the episode is done, only the reward ; column index matches the action
           #update the target by the Bellman equation, column index matches the action
           #DQN method : R_t+1 + discount*max Q'(s')
           target[i,which(c('up','down','left','right')==actions[i,])] <- rewards[i,] + discount*max(target_val[i,])
    )
  }
  return(list(states_batch = data.matrix(states),target = target)) #batch components for fitting the model
}


#======================================================================
# DDQN Agent
#======================================================================


#training agent function
double_dqn_agent = function(episode,params,grid,model_weight = NULL,double = T){

  #choose whether the trainer will be DDQN or DQN
  if(double){
    trainer = double_trainer
  }else{
    trainer = single_trainer
  }


  #unpack parameters from params
  discount = params$discount
  learning_rate = params$learning_rate
  batch_size = params$batch_size
  epsilon = params$epsilon
  epsilon_decay = params$epsilon_decay
  epsilon_min = params$epsilon_min
  train_start = params$train_start
  max_memory = params$max_memory

  #calculate grid size
  grid_size = dim(grid) %>% prod

  #set memory dataframe
  memory = data.frame()

  #allow 4 actions
  actions = c('up','down','left','right')

  #sizes of state and action
  state_size = 2
  action_size = length(actions)

  #initiate online and target models
  model = build_model(state_size,action_size,learning_rate)
  target_model = build_model(state_size,action_size,learning_rate)

  #give both models the same weights
  set_weights(target_model,get_weights(model))

  #optional : load preset model weights if provided
  if(!is.null(model_weight)){
    set_weights(model,model_weight)
    set_weights(target_model,model_weight)
  }

  #initiate q table
  qtable = expand.grid(x = 1:nrow(grid),y = 1:ncol(grid),action = actions,q = 0)


  #record training loss
  loss = c()

  #record scores
  scores = c()

  #record success frequency
  sfreq = c()

  #episode loop
  for(i in 1:episode){

    #set initial values
    done = F
    score = 0

    #reset state
    if(i %% 10 == 0){ #start at the first position
      state = list(x = 1, y = 1)
    }else{ #random initialization
      state = list(x = sample(1:nrow(grid),1),y = sample(1:ncol(grid),1))
    }

    #small memory of visited states, used to penalize the agent for revisiting
    small_memory = c()

    n_step = 0
    repeat{ #loop over a single episode

      #update small memory
      small_memory = append(small_memory,state %>% paste(collapse = ''))

      #choose an action for the current state
      action = get_action(state,epsilon,qtable,actions)

      #step by the given action and state
      time_step = step(state,action)


      n_step = n_step+1

      #update values
      reward = ifelse(time_step$reward==1,2*grid_size,time_step$reward) - 0.04 #0.04 = cost of making a move
      next_state = time_step$next_state
      score = score + reward
      done = (time_step$reward==1 | -grid_size*0.5 >= score) #finished, or stop to prevent an infinite loop

      if(done==F & next_state %>% paste(collapse = '') %in% small_memory){
        reward = reward-0.9 #penalty when the agent revisits a place it has already been
      }

      #append to memory
      memory = data.frame(state = state,
                          action=action,
                          reward=reward,
                          next_state = next_state,
                          done=done) %>%
        rbind(memory,.)

      #keep memory within max_memory rows
      if(nrow(memory)>max_memory){
        memory = memory[-1,]
      }

      #train the model
      if(nrow(memory)>=train_start){ #when memory has enough data
        #update epsilon
        epsilon = ifelse(epsilon > epsilon_min,epsilon*epsilon_decay,epsilon)

        #build the training batch and fit the model
        fit_vars = trainer(model,memory,target_model,batch_size,discount,qtable)
        model %>% fit(fit_vars$states_batch, fit_vars$target,
                      batch_size, epochs=1, verbose=0) #training
        train.loss = model$history$history$loss[[1]]

      }

      #update qtable
      qtable = predict(model,expand.grid(x=1:nrow(grid),y=1:ncol(grid)) %>% data.matrix) %>%
        data.frame %>% cbind(expand.grid(x=1:nrow(grid),y=1:ncol(grid)),.)
      colnames(qtable)[-(1:2)] = actions
      qtable = melt(qtable,c('x','y'))
      colnames(qtable)[3:4] = c('action','q')

      #update state
      state = next_state

      if(done){ #when the episode is done
        set_weights(target_model,get_weights(model)) #update the target model with the online model's weights
        loss[i] = ifelse(exists('train.loss'),train.loss,NA)
        scores[i] = score
        small_memory = append(small_memory,state %>% paste(collapse = '')) #update small memory
        break
      }

    } #loop

    #record whether the episode ended in success
    sfreq[i] = ifelse(time_step$reward==1,1,0)

    cat('Episode ',i,'is done \n',
        'Epsilon :',epsilon,' \n',
        'Score :',score,' \n',
        'Loss :',loss[i],' \n',
        'Memory size :',nrow(memory),' \n',
        'Steps :',n_step,' \n',
        'Result :',ifelse(time_step$reward==1,'Success','Fail'),' \n',
        'Winning rate (last 10) :',ifelse(length(sfreq)<10,'- less than 10 episodes',sum(tail(sfreq,10))/10),'\n',
        'Overall winning rate :',sum(sfreq)/i,'\n',
        '----------- \n') #end-of-episode message

    if(sum(tail(sfreq,20))/20 >= 0.95){
      cat("Winning rate over the last 20 episodes is at least 95% : stop training \n")
      break
    }

  }#episode end

  return(list(loss = loss,score = scores,model = model,target_model = target_model,
              last_moveset = small_memory))
}#function end
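
#Illustrative usage (assumed hyper-parameter values, not part of the original
#script): build a grid, collect the parameters the agent expects into a list
#and train for a number of episodes.
# grid = matrix(0, nrow = 5, ncol = 5); grid[2,3] = -1; grid[5,5] = 1
# params = list(discount = 0.95, learning_rate = 0.001, batch_size = 32,
#               epsilon = 1, epsilon_decay = 0.999, epsilon_min = 0.01,
#               train_start = 100, max_memory = 2000)
# result = double_dqn_agent(episode = 200, params = params, grid = grid, double = T)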



#======================================================================
# Visualization
#======================================================================

#draw the policy implied by a given model
draw_policy_model = function(grid,qmodel,method){

  #map destination and obstacles
  obstacle = which(grid==-1,arr.ind = T)
  destination = which(grid==1,arr.ind = T)

  #allow 4 actions
  actions = c('up','down','left','right')

  #initiate start point and q policy
  state = list(x = 1,y = 1)
  qpolicy = predict(qmodel,expand.grid(state.x = 1:nrow(grid),state.y = 1:ncol(grid)) %>% data.matrix) %>%
    as.data.frame %>% mutate(action = actions[max.col(.)]) %>% select(action) %>%
    cbind(expand.grid(x = 1:nrow(grid),y = 1:ncol(grid)),.)

  #choose the first action from the policy
  action = (qpolicy %>% filter(x==state$x,y==state$y))$action

  if(action == 'up' | action == 'left'){
    stop('Your policy asks the agent to make an illegal move : stop')
  }


  #make record dataframe
  record = data.frame(x=state$x,y=state$y,act = action)

  #record the path implied by the q policy
  repeat{

    #update state
    res = step(state,action)
    state = res$next_state
    done = (res$reward==1)


    if(!done){

      #choose the next action from the policy
      action = (qpolicy %>% filter(x==state$x,y==state$y))$action

      #check for illegal moves
      if(state$x==nrow(grid) & action == 'down' | state$y==ncol(grid) & action == 'right' |
         state$x==1 & action == 'up' | state$y==1 & action == 'left'){
        stop('Your policy asks the agent to make an illegal move : stop')
      }

      #update record
      record = rbind(record,data.frame(x=state$x,y=state$y,act = action))



      #check whether the policy asks the agent to loop
      if(TRUE %in% duplicated(record)){
        stop('Your policy asks the agent to revisit an area : this might cause an infinite loop. stop')
      }
    }else{

      #update record
      record = rbind(record,data.frame(x=state$x,y=state$y,act = action))
      break
    }

  }#end loop

  #visualization : make an empty grid object
  grid_plot = expand.grid(x=1:(ncol(grid)+1),y=1:(nrow(grid)+1)) %>% ggplot(aes(x,y))+
    geom_point(alpha=0)+
    geom_vline(xintercept = 1:(ncol(grid)+1))+
    geom_hline(yintercept = 1:(nrow(grid)+1))+
    ggtitle('Grid World Policy Viewer',
            subtitle = paste(method,'result'))+
    theme_bw()

  #add obstacles
  for(i in 1:nrow(obstacle)){
    x_add = obstacle[i,2]+0.5
    y_add = (nrow(grid)+1)-obstacle[i,1]+0.5
    grid_plot = grid_plot +
      geom_point(aes(x = !!x_add, y = !!y_add),shape = 2,color = 'dark green',size = 2)
  }

  #add destination
  grid_plot = grid_plot +
    geom_point(aes(x = destination[1,2]+0.5,
                   y = (nrow(grid)+1)-destination[1,1]+0.5),
               shape = 9,color = 'red',size = 2.5)

  #add the agent's path
  for(i in 1:nrow(record)){
    x_add = record[i,2]+0.5
    y_add = (nrow(grid)+1)-record[i,1]+0.5

    grid_plot = grid_plot +
      geom_point(aes(x = !!x_add,y = !!y_add),
                 color = 'blue',shape = 13, size = 1.8)
  }

  return(grid_plot)

}
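
#Illustrative usage (a minimal sketch): visualize the greedy path of the
#trained online model returned by double_dqn_agent().
# draw_policy_model(grid, result$model, method = 'DDQN')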