Untitled

# %%writefile /tmp/rl1_3.py
class UCB(object):
  """Upper-confidence-bound (UCB) agent for a multi-armed bandit.

  The agent keeps a running-mean reward estimate and a pull count per arm.
  After each observed reward it picks the arm that maximises

      estimate[a] + c * sqrt(log(total_pulls) / count[a])

  so rarely tried arms receive a larger exploration bonus. Arms that have
  never been pulled get an infinite bonus and are therefore tried first.

  NOTE(review): `np` (numpy) and `argmax` are used here but are not defined in
  this block; they are expected to be provided by the surrounding module.
  """

  def __init__(self, number_of_arms, c=2):
    """Create the agent.

    Args:
      number_of_arms: number of arms (actions) of the bandit.
      c: exploration coefficient multiplying the confidence bonus.
    """
    self._number_of_arms = number_of_arms
    self._c = c
    self.name = 'ucb'
    self.reset()

  def UCB_parameter(self, c, numerator, denominator):
    """Return the exploration bonus c * sqrt(log(numerator) / denominator).

    `denominator` may be a scalar or an array of per-arm counts, in which case
    the bonus is computed element-wise.
    """
    return c * np.sqrt(np.log(numerator) / denominator)

  def step(self, previous_action, reward):
    """Record the outcome of the previous action and choose the next one.

    Args:
      previous_action: index of the arm pulled last, or None on the first
        step of an episode (nothing to learn from yet).
      reward: reward received for `previous_action`; ignored when
        `previous_action` is None.

    Returns:
      The index of the arm to pull next.
    """
    if previous_action is None:
      # No feedback yet: choose purely from the current estimates.
      return argmax(self._estimates)

    # Incremental running-mean update: Q <- Q + (r - Q) / N.
    self._counts[previous_action] += 1
    pulls = self._counts[previous_action]
    self._estimates[previous_action] += (
        reward - self._estimates[previous_action]) / pulls

    # Per-arm bonus. Untried arms keep +inf so each arm is pulled at least
    # once, and we never divide by a zero count.
    tried = self._counts > 0
    bonus = np.full(self._number_of_arms, np.inf)
    bonus[tried] = self.UCB_parameter(
        self._c, np.sum(self._counts), self._counts[tried])

    return argmax(self._estimates + bonus)

  def reset(self):
    """Forget everything learned: zero all reward estimates and pull counts."""
    self._estimates = np.zeros((self._number_of_arms,))
    self._counts = np.zeros((self._number_of_arms,))