#======================================================================
# DDQN in Grid World
# Author : Junmo Nam
# Reference : Van Hasselt, Hado, Arthur Guez, and David Silver.
# "Deep reinforcement learning with double q-learning."
# Thirtieth AAAI Conference on Artificial Intelligence. 2016.
# BEGAS
#======================================================================
#required packages
library(keras)    #model building and training
library(dplyr)    #data manipulation (%>%, filter, select, sample_n, mutate)
library(reshape2) #melt() for reshaping the q table
library(ggplot2)  #policy visualization

#======================================================================
# Environment setting
#======================================================================
#step function : return the next state and reward for a given state-action pair
step = function(state, action){
  #x and y coordinates of the current state
  x = state$x
  y = state$y
  #apply the action
  if(x == 1 & action == 'up' | x == nrow(grid) & action == 'down' |
     y == 1 & action == 'left' | y == ncol(grid) & action == 'right'){
    #action would leave the grid : penalty, stay in place
    reward = -1.5
    next_state = state
  }else if(action == 'up'){
    reward = grid[x-1, y]
    next_state = list(x = x-1, y = y)
  }else if(action == 'down'){
    reward = grid[x+1, y]
    next_state = list(x = x+1, y = y)
  }else if(action == 'left'){
    reward = grid[x, y-1]
    next_state = list(x = x, y = y-1)
  }else{
    reward = grid[x, y+1]
    next_state = list(x = x, y = y+1)
  }
  if(reward == -1){ #hit a wall : bounce back with a penalty
    reward = -1.75
    next_state = state
  }
  return(list(next_state = next_state, reward = reward))
}
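#quick sanity check (illustrative sketch, assumes a reward matrix `grid` with
#0 = free cell, -1 = wall, 1 = goal; the values below are not from the original) :
# grid = matrix(0, 4, 4); grid[4, 4] = 1
# step(list(x = 1, y = 1), 'right') # -> list(next_state = list(x = 1, y = 2), reward = 0)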
#choose an action for the given state (epsilon-greedy)
get_action = function(state, epsilon, qtable, actions){
  if(runif(1) < epsilon){
    sample(actions, 1) #explore : uniform random action
  }else{
    #exploit : action with the highest q value (ties broken at random)
    as.character((qtable %>% filter(x == state$x, y == state$y) %>%
                    filter(q == max(q)) %>% sample_n(1))$action)
  }
}
#build the q network used for training (shared by DQN and DDQN)
build_model = function(state_size, action_size, learning_rate){
  model = keras_model_sequential() %>%
    # 1st hidden layer
    layer_dense(units = 30, input_shape = state_size,
                kernel_initializer = 'he_uniform', activation = 'relu') %>%
    # 2nd hidden layer
    layer_dense(units = 30, kernel_initializer = 'he_uniform', activation = 'relu') %>%
    # output layer : one linear unit per action
    layer_dense(units = action_size, activation = 'linear') %>%
    #compile with Adam and mean squared error loss
    compile(optimizer = optimizer_adam(lr = learning_rate), loss = 'mse')
  return(model)
}
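#shape check (hedged sketch) : a state (x, y) maps to one q value per action
# m = build_model(state_size = 2, action_size = 4, learning_rate = 0.001)
# predict(m, matrix(c(1, 1), nrow = 1)) # 1 x 4 matrix of q values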
#training model function : DDQN
double_trainer = function(model, memory, target_model, batch_size, discount, qtable){
  #sample a mini-batch from replay memory
  mini_batch = memory %>% sample_n(batch_size)
  #unpack the transitions : S, A, R, S'
  states = mini_batch[,1:2] #state vars
  names(states) = c('x','y')
  actions = mini_batch %>% select(action)
  rewards = mini_batch %>% select(reward)
  next_states = mini_batch %>% select(grep('next_state', names(.)))
  names(next_states) = c('x','y')
  #get dones
  dones = mini_batch %>% select(done)
  #Q(s) from the online model; Q(s') from both networks
  target = predict(model, data.matrix(states))
  online_next = predict(model, data.matrix(next_states))        #for action selection
  target_val = predict(target_model, data.matrix(next_states))  #for action evaluation
  #Bellman update for each sampled transition
  for(i in 1:batch_size){
    col = which(c('up','down','left','right') == actions[i,]) #column index of the taken action
    if(dones[i,]){
      target[i, col] = rewards[i,] #terminal transition : target is the reward alone
    }else{
      #DDQN target : R_t+1 + discount * Q_target(s', argmax_a Q_online(s', a))
      #select the action with the online network, evaluate it with the target network
      target[i, col] = rewards[i,] + discount * target_val[i, which.max(online_next[i,])]
    }
  }
  return(list(states_batch = data.matrix(states), target = target)) #batch components for fitting
}
#training model function : DQN
single_trainer = function(model, memory, target_model, batch_size, discount, qtable){
  #sample a mini-batch from replay memory
  mini_batch = memory %>% sample_n(batch_size)
  #unpack the transitions : S, A, R, S'
  states = mini_batch[,1:2] #state vars
  names(states) = c('x','y')
  actions = mini_batch %>% select(action)
  rewards = mini_batch %>% select(reward)
  next_states = mini_batch %>% select(grep('next_state', names(.)))
  names(next_states) = c('x','y')
  #get dones
  dones = mini_batch %>% select(done)
  #Q(s) from the online model, Q(s') from the target model
  target = predict(model, data.matrix(states))
  target_val = predict(target_model, data.matrix(next_states))
  #Bellman update for each sampled transition
  for(i in 1:batch_size){
    col = which(c('up','down','left','right') == actions[i,]) #column index of the taken action
    if(dones[i,]){
      target[i, col] = rewards[i,] #terminal transition : target is the reward alone
    }else{
      #DQN target : R_t+1 + discount * max_a Q_target(s', a)
      target[i, col] = rewards[i,] + discount * max(target_val[i,])
    }
  }
  return(list(states_batch = data.matrix(states), target = target)) #batch components for fitting
}
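#note : the two trainers differ only in the non-terminal target
# DDQN : y = R + discount * Q_target(s', argmax_a Q_online(s', a))
# DQN  : y = R + discount * max_a Q_target(s', a)
#decoupling action selection (online net) from evaluation (target net) is what
#reduces the overestimation bias described in Van Hasselt et al. (2016)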
#======================================================================
# DDQN Agent
#======================================================================
#training agent function
double_dqn_agent = function(episode, params, grid, model_weight = NULL, double = T){
  #choose the trainer : DDQN or DQN
  if(double){
    trainer = double_trainer
  }else{
    trainer = single_trainer
  }
  #unpack parameters
  discount = params$discount
  learning_rate = params$learning_rate
  batch_size = params$batch_size
  epsilon = params$epsilon
  epsilon_decay = params$epsilon_decay
  epsilon_min = params$epsilon_min
  train_start = params$train_start
  max_memory = params$max_memory
  #grid size (number of cells)
  grid_size = dim(grid) %>% prod
  #replay memory dataframe
  memory = data.frame()
  #the 4 allowed actions
  actions = c('up','down','left','right')
  #state and action dimensions
  state_size = 2
  action_size = length(actions)
  #initiate the online and target models
  model = build_model(state_size, action_size, learning_rate)
  target_model = build_model(state_size, action_size, learning_rate)
  #start both models with identical weights
  set_weights(target_model, get_weights(model))
  #optional : load preset model weights if provided
  if(!is.null(model_weight)){
    set_weights(model, model_weight)
    set_weights(target_model, model_weight)
  }
  #initialize the q table
  qtable = expand.grid(x = 1:nrow(grid), y = 1:ncol(grid), action = actions, q = 0)
  #record training loss per episode
  loss = c()
  #record score per episode
  scores = c()
  #record success indicator per episode
  sfreq = c()
  #episode loop
  for(i in 1:episode){
    #reset episode flags
    done = F
    score = 0
    #reset state : every 10th episode starts at the origin, otherwise random
    if(i %% 10 == 0){
      state = list(x = 1, y = 1)
    }else{
      state = list(x = sample(1:nrow(grid), 1), y = sample(1:ncol(grid), 1))
    }
    #small memory of visited states, used to penalize revisits
    small_memory = c()
    n_step = 0
    repeat{ #loop over a single episode
      #record the current state
      small_memory = append(small_memory, state %>% paste(collapse = ''))
      #choose an action for the current state
      action = get_action(state, epsilon, qtable, actions)
      #take the action
      time_step = step(state, action)
      n_step = n_step + 1
      #update values : 0.04 = cost per movement, big bonus for reaching the goal
      reward = ifelse(time_step$reward == 1, 2*grid_size, time_step$reward) - 0.04
      next_state = time_step$next_state
      score = score + reward
      done = (time_step$reward == 1 | -grid_size*0.5 >= score) #goal reached, or stop a runaway episode
      if(done == F & next_state %>% paste(collapse = '') %in% small_memory){
        reward = reward - 0.9 #penalty for revisiting a state
      }
      #append the transition to replay memory
      memory = data.frame(state = state,
                          action = action,
                          reward = reward,
                          next_state = next_state,
                          done = done) %>%
        rbind(memory, .)
      #cap replay memory at max_memory rows
      if(nrow(memory) > max_memory){
        memory = memory[-1,]
      }
      #train the model once memory has enough data
      if(nrow(memory) >= train_start){
        #decay epsilon
        epsilon = ifelse(epsilon > epsilon_min, epsilon*epsilon_decay, epsilon)
        #build the training batch, fit the model, and record the loss
        fit_vars = trainer(model, memory, target_model, batch_size, discount, qtable)
        history = model %>% fit(fit_vars$states_batch, fit_vars$target,
                                batch_size = batch_size, epochs = 1, verbose = 0)
        train.loss = history$metrics$loss[[1]]
      }
      #refresh the q table from the online model
      qtable = predict(model, expand.grid(x = 1:nrow(grid), y = 1:ncol(grid)) %>% data.matrix) %>%
        data.frame %>% cbind(expand.grid(x = 1:nrow(grid), y = 1:ncol(grid)), .)
      colnames(qtable)[-(1:2)] = actions
      qtable = melt(qtable, c('x','y'))
      colnames(qtable)[3:4] = c('action','q')
      #move to the next state
      state = next_state
      if(done){ #episode finished
        set_weights(target_model, get_weights(model)) #sync target model with online model
        loss[i] = ifelse(exists('train.loss'), train.loss, NA)
        scores[i] = score
        small_memory = append(small_memory, state %>% paste(collapse = '')) #record the final state
        break
      }
    } #repeat
    #record success and report
    sfreq[i] = ifelse(time_step$reward == 1, 1, 0)
    cat('Episode ', i, 'is done \n',
        'Epsilon :', epsilon, ' \n',
        'Score :', score, ' \n',
        'Loss :', loss[i], ' \n',
        'Memory size :', nrow(memory), ' \n',
        'Steps :', n_step, ' \n',
        'Result :', ifelse(time_step$reward == 1, 'Success', 'Fail'), ' \n',
        'Winning rate (last 10) :', ifelse(length(sfreq) < 10, '- fewer than 10 episodes', sum(tail(sfreq, 10))/10), '\n',
        'Overall winning rate :', sum(sfreq)/i, '\n',
        '----------- \n') #end-of-episode report
    if(sum(tail(sfreq, 20))/20 >= 0.95){
      cat("Winning rate over the last 20 episodes is at least 95% : stop training \n")
      break
    }
  } #episode loop
  return(list(loss = loss, score = scores, model = model, target_model = target_model,
              last_moveset = small_memory))
} #function end
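#the returned list holds the per-episode loss and score traces, the trained
#online and target models, and the visited-state moveset of the final episode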
#======================================================================
# Visualization
#======================================================================
#draw the policy implied by a given model
draw_policy_model = function(grid, qmodel, method){
  #locate obstacles and the destination
  obstacle = which(grid == -1, arr.ind = T)
  destination = which(grid == 1, arr.ind = T)
  #the 4 allowed actions
  actions = c('up','down','left','right')
  #initiate the start point and the greedy policy table
  state = list(x = 1, y = 1)
  qpolicy = predict(qmodel, expand.grid(state.x = 1:nrow(grid), state.y = 1:ncol(grid)) %>% data.matrix) %>%
    as.data.frame %>% mutate(action = actions[max.col(.)]) %>% select(action) %>%
    cbind(expand.grid(x = 1:nrow(grid), y = 1:ncol(grid)), .)
  #first action
  action = (qpolicy %>% filter(x == state$x, y == state$y))$action
  if(action == 'up' | action == 'left'){
    stop('Your policy asks the agent to make an illegal move : stop')
  }
  #record dataframe of visited cells and actions
  record = data.frame(x = state$x, y = state$y, act = action)
  #follow the greedy policy until the destination is reached
  repeat{
    #update state
    res = step(state, action)
    state = res$next_state
    done = (res$reward == 1)
    if(!done){
      #choose the next action from the existing policy
      action = (qpolicy %>% filter(x == state$x, y == state$y))$action
      #check for moves that would leave the grid
      if(state$x == nrow(grid) & action == 'down' | state$y == ncol(grid) & action == 'right' |
         state$x == 1 & action == 'up' | state$y == 1 & action == 'left'){
        stop('Your policy asks the agent to make an illegal move : stop')
      }
      #update the record
      record = rbind(record, data.frame(x = state$x, y = state$y, act = action))
      #check whether the policy loops back to a visited cell
      if(TRUE %in% duplicated(record)){
        stop('Your policy asks the agent to revisit a cell : this may cause an infinite loop. stop')
      }
    }else{
      #record the final cell
      record = rbind(record, data.frame(x = state$x, y = state$y, act = action))
      break
    }
  } #end loop
  #visualization : make an empty grid canvas
  grid_plot = expand.grid(x = 1:(ncol(grid)+1), y = 1:(nrow(grid)+1)) %>% ggplot(aes(x, y)) +
    geom_point(alpha = 0) +
    geom_vline(xintercept = 1:(ncol(grid)+1)) +
    geom_hline(yintercept = 1:(nrow(grid)+1)) +
    ggtitle('Grid World Policy Viewer',
            subtitle = paste(method, 'result')) +
    theme_bw()
  #add obstacles
  for(i in 1:nrow(obstacle)){
    x_add = obstacle[i,2] + 0.5
    y_add = (nrow(grid)+1) - obstacle[i,1] + 0.5
    grid_plot = grid_plot +
      geom_point(aes(x = !!x_add, y = !!y_add), shape = 2, color = 'darkgreen', size = 2)
  }
  #add the destination
  grid_plot = grid_plot +
    geom_point(aes(x = destination[1,2] + 0.5,
                   y = (nrow(grid)+1) - destination[1,1] + 0.5),
               shape = 9, color = 'red', size = 2.5)
  #plot the agent's path on the canvas
  for(i in 1:nrow(record)){
    x_add = record[i,2] + 0.5
    y_add = (nrow(grid)+1) - record[i,1] + 0.5
    grid_plot = grid_plot +
      geom_point(aes(x = !!x_add, y = !!y_add),
                 color = 'blue', shape = 13, size = 1.8)
  }
  return(grid_plot)
}
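
#======================================================================
# Usage example (a minimal sketch, not part of the original paste)
#======================================================================
#assumed grid encoding, matching step() : 0 = free, -1 = wall, 1 = goal
grid = matrix(0, nrow = 4, ncol = 4)
grid[2, 2] = -1 #wall
grid[3, 3] = -1 #wall
grid[4, 4] = 1  #goal
#illustrative hyperparameters (not tuned, not from the original paste)
params = list(discount = 0.95, learning_rate = 0.001, batch_size = 32,
              epsilon = 1, epsilon_decay = 0.999, epsilon_min = 0.01,
              train_start = 200, max_memory = 2000)
#train the DDQN agent, then draw the learned policy
result = double_dqn_agent(episode = 300, params = params, grid = grid, double = T)
draw_policy_model(grid, result$model, method = 'DDQN')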