Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Evaluate a behavior-cloning policy under increasing training budgets.
# For each epoch count i in 1, 31, 61, ..., < num_epochs, the cloner is
# trained for i epochs, then a single episode is rolled out in `env`
# (rendered as it runs) and its total reward appended to `returns`;
# the running mean/std of returns is printed after each rollout.
#
# NOTE(review): assumes `env` (gym-style: reset/step/render), `cloner`
# (train/sample), and `np` (numpy) are defined earlier in this file —
# confirm against the surrounding code.

num_epochs = 200

# loop over rollouts, one per training budget
returns = []
for i in range(1, num_epochs, 30):
    # reset per-rollout state
    totalr = 0
    done = False
    obs = env.reset()
    # progress marker: show which training budget is being evaluated
    print(i)
    # (re)train the cloner for i epochs before this evaluation rollout
    cloner.train(epochs=i)
    # roll out one full episode until the environment signals termination
    while not done:
        # query the trained model for an action given the current observation
        action = cloner.sample(obs)
        # apply the action and collect the reward
        obs, r, done, _ = env.step(action)
        env.render()
        totalr += r
    returns.append(totalr)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement