mirror of
https://github.com/google-deepmind/deepmind-research.git
synced 2026-05-09 21:07:49 +08:00
9e3d04c867
PiperOrigin-RevId: 272089371
131 lines
5.1 KiB
Python
# Copyright 2019 DeepMind Technologies Limited.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# https://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ============================================================================
|
|
"""Run a Q-learning agent with a side effects penalty."""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
from absl import app
|
|
from absl import flags
|
|
import pandas as pd
|
|
from six.moves import range
|
|
from six.moves import zip
|
|
from side_effects_penalties import agent_with_penalties
|
|
from side_effects_penalties import training
|
|
from side_effects_penalties.file_loading import filename
|
|
|
|
|
|
FLAGS = flags.FLAGS
|
|
|
|
if __name__ == '__main__':  # Avoid defining flags when used as a library.
  # Side effects penalty configuration.
  flags.DEFINE_enum('baseline', 'stepwise',
                    ['start', 'inaction', 'stepwise', 'step_noroll'],
                    'Baseline.')
  flags.DEFINE_enum('dev_measure', 'rel_reach',
                    ['none', 'reach', 'rel_reach', 'att_util'],
                    'Deviation measure.')
  flags.DEFINE_enum('dev_fun', 'truncation', ['truncation', 'absolute'],
                    'Summary function for the deviation measure.')
  flags.DEFINE_float('discount', 0.99, 'Discount factor for rewards.')
  flags.DEFINE_float('value_discount', 0.99,
                     'Discount factor for deviation measure value function.')
  flags.DEFINE_float('beta', 30.0, 'Weight for side effects penalty.')
  # Training configuration.
  flags.DEFINE_bool('anneal', True,
                    'Whether to anneal the exploration rate from 1 to 0.')
  flags.DEFINE_integer('num_episodes', 10000, 'Number of episodes.')
  flags.DEFINE_integer('num_episodes_noexp', 0,
                       'Number of episodes with no exploration.')
  flags.DEFINE_integer('seed', 1, 'Random seed.')
  # Environment configuration.
  flags.DEFINE_string('env_name', 'box', 'Environment name.')
  flags.DEFINE_bool('noops', True, 'Whether the environment includes noops.')
  flags.DEFINE_bool('exact_baseline', False,
                    'Compute the exact baseline using an environment copy.')
  # Output configuration.
  flags.DEFINE_enum('mode', 'save', ['print', 'save'],
                    'Print results or save to file.')
  flags.DEFINE_string('path', '', 'File path.')
  flags.DEFINE_string('suffix', '', 'Filename suffix.')
|
|
|
|
|
|
def run_experiment(baseline, dev_measure, dev_fun, discount, value_discount,
                   beta, anneal, num_episodes, num_episodes_noexp, seed,
                   env_name, noops, exact_baseline, mode, path, suffix):
  """Train a Q-learning agent with a side effects penalty; save or print results.

  Args:
    baseline: baseline state for the penalty ('start', 'inaction', ...).
    dev_measure: deviation measure ('none', 'reach', 'rel_reach', 'att_util').
    dev_fun: summary function for the deviation measure.
    discount: discount factor for rewards.
    value_discount: discount factor for the deviation value function.
    beta: weight of the side effects penalty.
    anneal: whether to anneal the exploration rate from 1 to 0.
    num_episodes: number of training episodes.
    num_episodes_noexp: additional episodes run without exploration.
    seed: random seed.
    env_name: environment name.
    noops: whether the environment includes noop actions.
    exact_baseline: compute the exact baseline using an environment copy.
    mode: 'save' writes a CSV; 'print' leaves output to the caller.
    path: directory for the output file (used in 'save' mode).
    suffix: filename suffix (used in 'save' mode).

  Returns:
    A (reward, performance) pair of per-episode sequences.
  """
  # The summary function is only meaningful for these deviation measures.
  if dev_measure not in ['rel_reach', 'att_util']:
    dev_fun = 'none'
  reward, performance = training.run_agent(
      baseline=baseline, dev_measure=dev_measure, dev_fun=dev_fun,
      discount=discount, value_discount=value_discount, beta=beta,
      anneal=anneal, num_episodes=num_episodes,
      num_episodes_noexp=num_episodes_noexp, seed=seed, env_name=env_name,
      noops=noops, agent_class=agent_with_penalties.QLearningSE,
      exact_baseline=exact_baseline)
  if mode == 'save':
    total_episodes = num_episodes + num_episodes_noexp
    results = pd.DataFrame({
        'reward': list(reward),
        'performance': list(performance),
        'seed': [seed] * total_episodes,
        'episode': list(range(total_episodes)),
    })
    smoothed = add_smoothed_data(results)
    out_file = filename(env_name, noops, dev_measure, dev_fun, baseline, beta,
                        value_discount, path=path, suffix=suffix, seed=seed)
    smoothed.to_csv(out_file)
  return reward, performance
|
|
|
|
|
|
def _smooth(values, window=100):
|
|
return values.rolling(window,).mean()
|
|
|
|
|
|
def add_smoothed_data(df, groupby='seed', window=100):
  """Append rolling-mean columns for 'reward' and 'performance' to `df`.

  Args:
    df: dataframe with 'reward' and 'performance' columns.
    groupby: column to group by before smoothing (one group per seed).
    window: rolling-mean window size, in episodes.

  Returns:
    A copy of `df` with added 'reward_smooth' and 'performance_smooth'
    columns.
  """
  raw_columns = df.groupby(groupby)[['reward', 'performance']]
  smoothed = raw_columns.apply(_smooth, window=window)
  smoothed = smoothed.rename(
      columns={'performance': 'performance_smooth',
               'reward': 'reward_smooth'})
  # Concatenate on the index so each row keeps its smoothed counterpart.
  return pd.concat([df, smoothed], axis=1)
|
|
|
|
|
|
def main(unused_argv):
  """Run the experiment configured by the command-line flags.

  In 'print' mode, reports the performance/reward pairs for the last ten
  episodes; in 'save' mode, run_experiment writes the results to a CSV file.
  """
  reward, performance = run_experiment(
      baseline=FLAGS.baseline,
      dev_measure=FLAGS.dev_measure,
      dev_fun=FLAGS.dev_fun,
      discount=FLAGS.discount,
      value_discount=FLAGS.value_discount,
      beta=FLAGS.beta,
      anneal=FLAGS.anneal,
      num_episodes=FLAGS.num_episodes,
      num_episodes_noexp=FLAGS.num_episodes_noexp,
      seed=FLAGS.seed,
      env_name=FLAGS.env_name,
      noops=FLAGS.noops,
      exact_baseline=FLAGS.exact_baseline,
      mode=FLAGS.mode,
      path=FLAGS.path,
      suffix=FLAGS.suffix)
  if FLAGS.mode == 'print':
    print('Performance and reward in the last 10 steps:')
    # Bug fix: the slice was [-10:-1], which printed only 9 entries and
    # silently dropped the final episode.
    print(list(zip(performance, reward))[-10:])
|
|
|
|
|
|
if __name__ == '__main__':
  # absl parses the flags defined above before invoking main.
  app.run(main)
|