Open sourcing side effects code

PiperOrigin-RevId: 272089371
Victoria Krakovna
2019-10-01 00:17:21 +01:00
committed by Diego de Las Casas
parent 60655a2797
commit 9e3d04c867
12 changed files with 1916 additions and 0 deletions
@@ -0,0 +1,150 @@
# Copyright 2019 DeepMind Technologies Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Vanilla Q-Learning agent."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import collections.abc
import numpy as np
from six.moves import range
class EpsilonGreedyPolicy(object):
"""Epsilon greedy policy for table value function lookup."""
def __init__(self, value_function, actions):
"""Construct an epsilon greedy policy object.
Args:
value_function: agent value function as a dict.
actions: list of possible actions.
Raises:
      ValueError: if the `actions` argument is not an iterable.
"""
    if not isinstance(actions, collections.abc.Iterable):
      raise ValueError('`actions` argument must be an iterable.')
self._value_function = value_function
self._actions = actions
def get_action(self, epsilon, state):
"""Get action following the e-greedy policy.
Args:
      epsilon: probability of selecting a random action.
      state: current state of the game as a tuple of board rows.
Returns:
Chosen action.
"""
if np.random.random() < epsilon:
return np.random.choice(self._actions)
else:
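      # Otherwise exploit: pick a highest-valued action, breaking ties
      # uniformly at random.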
values = [self._value_function[(state, action)]
for action in self._actions]
max_value = max(values)
max_indices = [i for i, value in enumerate(values) if value == max_value]
return self._actions[np.random.choice(max_indices)]
class QLearning(object):
"""Q-learning agent."""
def __init__(self, actions, alpha=0.1, epsilon=0.1, q_initialisation=0.0,
discount=0.99):
"""Create a Q-learning agent.
Args:
      actions: a BoundedArraySpec that specifies the full discrete action spec.
alpha: agent learning rate.
epsilon: agent exploration rate.
q_initialisation: float, used to initialise the value function.
discount: discount factor for rewards.
"""
self._value_function = collections.defaultdict(lambda: q_initialisation)
self._valid_actions = list(range(actions.minimum, actions.maximum + 1))
self._policy = EpsilonGreedyPolicy(self._value_function,
self._valid_actions)
# Hyperparameters.
self.alpha = alpha
self.epsilon = epsilon
self.discount = discount
# Episode internal variables.
self._current_action = None
self._current_state = None
def begin_episode(self):
"""Perform episode initialisation."""
self._current_state = None
self._current_action = None
def _timestep_to_state(self, timestep):
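    # Convert the board observation into a hashable tuple of tuples so it
    # can key the tabular value function.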
return tuple(map(tuple, np.copy(timestep.observation['board'])))
def step(self, timestep):
"""Perform a single step in the environment."""
# Get state observations.
state = self._timestep_to_state(timestep)
    # This is one of the follow-up states (i.e. not the initial state).
if self._current_state is not None:
self._update(timestep, state)
self._current_state = state
# Determine action.
self._current_action = self._policy.get_action(self.epsilon, state)
# Emit action.
return self._current_action
def _calculate_reward(self, timestep, unused_state):
"""Calculate reward: to be extended when impact penalty is added."""
reward = timestep.reward
return reward
def _update(self, timestep, state):
"""Perform value function update."""
reward = self._calculate_reward(timestep, state)
# Terminal state.
if not state:
delta = (reward - self._value_function[(self._current_state,
self._current_action)])
# Non-terminal state.
else:
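      # With epsilon=0 the policy is fully greedy, so this bootstraps from
      # max_a' Q(s', a').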
max_action = self._policy.get_action(0, state)
delta = (
reward + self.discount * self._value_function[(state, max_action)] -
self._value_function[(self._current_state, self._current_action)])
self._value_function[(self._current_state,
self._current_action)] += self.alpha * delta
def end_episode(self, timestep):
"""Performs episode cleanup."""
# Update for the terminal state.
self._update(timestep, None)
@property
def value_function(self):
return self._value_function
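
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative; not part of the original file). It
# assumes a dm_env/pycolab-style timestep exposing `reward` and
# `observation['board']`, and an action spec exposing integer `minimum` and
# `maximum` bounds; `Timestep` and `Spec` below are hypothetical stand-ins.
if __name__ == '__main__':
  Timestep = collections.namedtuple('Timestep', ['reward', 'observation'])
  Spec = collections.namedtuple('Spec', ['minimum', 'maximum'])

  agent = QLearning(Spec(minimum=0, maximum=3))
  board = np.zeros((3, 3), dtype=int)

  agent.begin_episode()
  # First step: no value update yet, since there is no previous state.
  action = agent.step(Timestep(reward=None, observation={'board': board}))
  board[0, 0] = action  # Pretend the environment reacted to the action.
  # Second step: performs a Q-learning update for the first transition.
  agent.step(Timestep(reward=1.0, observation={'board': board}))
  # Episode end: a final update against the terminal state.
  agent.end_episode(Timestep(reward=0.0, observation={'board': board}))
  print('%d state-action values learned' % len(agent.value_function))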