From bc398d80043a53cdd6989f67db32ff4c28d36a9f Mon Sep 17 00:00:00 2001
From: Victoria Krakovna
Date: Mon, 12 Oct 2020 17:36:57 +0100
Subject: [PATCH] Added functionality for running on new environment variants
 in the future task paper

PiperOrigin-RevId: 336680745
---
 side_effects_penalties/run_experiment.py | 13 ++++-
 .../side_effects_penalty_test.py         | 11 +++--
 side_effects_penalties/training.py       | 47 ++++++++++++++-----
 3 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/side_effects_penalties/run_experiment.py b/side_effects_penalties/run_experiment.py
index a219d34..31975a5 100644
--- a/side_effects_penalties/run_experiment.py
+++ b/side_effects_penalties/run_experiment.py
@@ -51,6 +51,10 @@ if __name__ == '__main__':  # Avoid defining flags when used as a library.
   flags.DEFINE_integer('seed', 1, 'Random seed.')
   flags.DEFINE_string('env_name', 'box', 'Environment name.')
   flags.DEFINE_bool('noops', True, 'Whether the environment includes noops.')
+  flags.DEFINE_integer('movement_reward', 0, 'Movement reward.')
+  flags.DEFINE_integer('goal_reward', 1, 'Reward for reaching a goal state.')
+  flags.DEFINE_integer('side_effect_reward', -1,
+                       'Hidden reward for causing side effects.')
   flags.DEFINE_bool('exact_baseline', False,
                     'Compute the exact baseline using an environment copy.')
   flags.DEFINE_enum('mode', 'save', ['print', 'save'],
@@ -61,7 +65,8 @@ if __name__ == '__main__':  # Avoid defining flags when used as a library.
 
 def run_experiment(baseline, dev_measure, dev_fun, discount, value_discount,
                    beta, anneal, num_episodes, num_episodes_noexp, seed,
-                   env_name, noops, exact_baseline, mode, path, suffix):
+                   env_name, noops, exact_baseline, mode, path, suffix,
+                   movement_reward, goal_reward, side_effect_reward):
   """Run agent and save or print the results."""
   performances = []
   rewards = []
@@ -75,7 +80,8 @@ def run_experiment(baseline, dev_measure, dev_fun, discount, value_discount,
       anneal=anneal, num_episodes=num_episodes,
       num_episodes_noexp=num_episodes_noexp, seed=seed, env_name=env_name,
       noops=noops, agent_class=agent_with_penalties.QLearningSE,
-      exact_baseline=exact_baseline)
+      exact_baseline=exact_baseline, movement_reward=movement_reward,
+      goal_reward=goal_reward, side_effect_reward=side_effect_reward)
   rewards.extend(reward)
   performances.extend(performance)
   seeds.extend([seed] * (num_episodes + num_episodes_noexp))
@@ -117,6 +123,9 @@ def main(unused_argv):
       seed=FLAGS.seed,
       env_name=FLAGS.env_name,
       noops=FLAGS.noops,
+      movement_reward=FLAGS.movement_reward,
+      goal_reward=FLAGS.goal_reward,
+      side_effect_reward=FLAGS.side_effect_reward,
       exact_baseline=FLAGS.exact_baseline,
       mode=FLAGS.mode,
       path=FLAGS.path,
diff --git a/side_effects_penalties/side_effects_penalty_test.py b/side_effects_penalties/side_effects_penalty_test.py
index 2833dec..2468216 100644
--- a/side_effects_penalties/side_effects_penalty_test.py
+++ b/side_effects_penalties/side_effects_penalty_test.py
@@ -44,8 +44,8 @@ class SideEffectsTestCase(parameterized.TestCase):
 class BaselineTestCase(SideEffectsTestCase):
 
   def _create_baseline(self, env_name):
-    self._env = training.get_env(env_name, True)
-    self._baseline_env = training.get_env(env_name, True)
+    self._env, _ = training.get_env(env_name, True)
+    self._baseline_env, _ = training.get_env(env_name, True)
     baseline_class = getattr(side_effects_penalty,
                              self.__class__.__name__[:-4])  # remove 'Test'
     self._baseline = baseline_class(
@@ -84,7 +84,8 @@ class StartBaselineTest(BaselineTestCase):
 
 class InactionBaselineTest(BaselineTestCase):
 
-  box_action_spec = training.get_env('box', True).action_spec()
+  box_env, _ = training.get_env('box', True)
+  box_action_spec = box_env.action_spec()
 
   @parameterized.parameters(
       *list(range(box_action_spec.minimum, box_action_spec.maximum + 1)))
@@ -191,7 +192,7 @@ class NoDeviationTest(SideEffectsTestCase):
   def _random_initial_transition(self):
     env_name = np.random.choice(environments)
     noops = np.random.choice([True, False])
-    env = training.get_env(env_name, noops)
+    env, _ = training.get_env(env_name, noops)
     action_range = self._env_to_action_range(env)
     action = np.random.choice(action_range)
     state1 = self._timestep_to_state(env.reset())
@@ -216,7 +217,7 @@ class UnreachabilityTest(SideEffectsTestCase):
   def testUnreachabilityCycle(self, gamma):
     # Reachability with no dev_fun means unreachability
     deviation = side_effects_penalty.Reachability(value_discount=gamma)
-    env = training.get_env('box', False)
+    env, _ = training.get_env('box', False)
     state0 = self._timestep_to_state(env.reset())
     state1 = self._timestep_to_state(env.step(Actions.LEFT))
 
diff --git a/side_effects_penalties/training.py b/side_effects_penalties/training.py
index e6fbfe2..89a016b 100644
--- a/side_effects_penalties/training.py
+++ b/side_effects_penalties/training.py
@@ -23,16 +23,25 @@ import numpy as np
 from six.moves import range
 
 
-def get_env(env_name, noops):
+def get_env(env_name, noops,
+            movement_reward=-1, goal_reward=1, side_effect_reward=-1):
   """Get a copy of the environment for simulating the baseline."""
-  if env_name == 'box':
-    env = factory.get_environment_obj('side_effects_sokoban', noops=noops)
-  elif env_name in ['vase', 'sushi', 'sushi_goal']:
+  if env_name == 'box' or 'sokocoin' in env_name:
+    levels = {'box': 0, 'sokocoin1': 1, 'sokocoin2': 2, 'sokocoin3': 3}
+    sizes = {'box': 36, 'sokocoin1': 100, 'sokocoin2': 72, 'sokocoin3': 100}
     env = factory.get_environment_obj(
-        'conveyor_belt', variant=env_name, noops=noops)
+        'side_effects_sokoban', noops=noops, movement_reward=movement_reward,
+        goal_reward=goal_reward, wall_reward=side_effect_reward,
+        corner_reward=side_effect_reward, level=levels[env_name])
+    size = sizes[env_name]
+  elif 'sushi' in env_name or env_name == 'vase':
+    env = factory.get_environment_obj(
+        'conveyor_belt', variant=env_name, noops=noops, goal_reward=goal_reward)
+    size = 49
   else:
     env = factory.get_environment_obj(env_name)
-  return env
+    size = None
+  return env, size
 
 
 def run_loop(agent, env, number_episodes, anneal):
@@ -63,8 +72,9 @@ def run_loop(agent, env, number_episodes, anneal):
 
 
 def run_agent(baseline, dev_measure, dev_fun, discount, value_discount, beta,
-              anneal, seed, env_name, noops, num_episodes, num_episodes_noexp,
-              exact_baseline, agent_class):
+              exact_baseline, anneal, num_episodes, num_episodes_noexp, seed,
+              env_name, noops, movement_reward, goal_reward, side_effect_reward,
+              agent_class):
   """Run agent.
 
   Create an agent with the given parameters for the side effects penalty.
@@ -79,14 +89,17 @@ def run_agent(baseline, dev_measure, dev_fun, discount, value_discount, beta,
     discount: discount factor
     value_discount: discount factor for deviation measure value function.
     beta: weight for side effects penalty
+    exact_baseline: whether to use an exact or approximate baseline
     anneal: whether to anneal the exploration rate from 1 to 0 or use a
       constant exploration rate
+    num_episodes: number of episodes
+    num_episodes_noexp: number of episodes with no exploration
     seed: random seed
     env_name: environment name
     noops: whether the environment has noop actions
-    num_episodes: number of episodes
-    num_episodes_noexp: number of episodes with no exploration
-    exact_baseline: whether to use an exact or approximate baseline
+    movement_reward: movement reward
+    goal_reward: reward for reaching a goal state
+    side_effect_reward: hidden reward for causing side effects
     agent_class: Q-learning agent class: QLearning (regular) or QLearningSE
       (with side effects penalty)
 
@@ -94,11 +107,19 @@ def run_agent(baseline, dev_measure, dev_fun, discount, value_discount, beta,
     returns: list of cumulative rewards for each episode
     performances: safety performance for each episode
   """
   np.random.seed(seed)
-  env = get_env(env_name, noops)
+  env, _ = get_env(env_name=env_name,
+                   noops=noops,
+                   movement_reward=movement_reward,
+                   goal_reward=goal_reward,
+                   side_effect_reward=side_effect_reward)
   start_timestep = env.reset()
   if exact_baseline:
-    baseline_env = get_env(env_name, True)
+    baseline_env, _ = get_env(env_name=env_name,
+                              noops=True,
+                              movement_reward=movement_reward,
+                              goal_reward=goal_reward,
+                              side_effect_reward=side_effect_reward)
   else:
     baseline_env = None
   agent = agent_class(
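
Usage note (not part of the patch): a minimal sketch of how the updated
get_env interface might be called after this change, which now returns an
(env, size) tuple and forwards the reward settings to the environment
factory. The import path assumes the deepmind-research package layout;
the reward values below mirror the new flag defaults in run_experiment.py.

    from side_effects_penalties import training

    # 'sokocoin2' is one of the new variants mapped inside get_env; the
    # second return value comes from its sizes table (72 for sokocoin2).
    env, size = training.get_env('sokocoin2', noops=True,
                                 movement_reward=0, goal_reward=1,
                                 side_effect_reward=-1)
    timestep = env.reset()  # reset()/step(), as used elsewhere in the patch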