From 8899ec087cbc36c10f4e9d6dfd369b46e4dcfff9 Mon Sep 17 00:00:00 2001 From: Khimya Date: Thu, 18 Jun 2020 14:31:31 -0400 Subject: [PATCH] affordances in discrete env --- .../AffordancesInDiscreteEnvironment.ipynb | 2449 +++++++++++++++++ affordances_theory/README.md | 7 +- 2 files changed, 2454 insertions(+), 2 deletions(-) create mode 100644 affordances_theory/AffordancesInDiscreteEnvironment.ipynb diff --git a/affordances_theory/AffordancesInDiscreteEnvironment.ipynb b/affordances_theory/AffordancesInDiscreteEnvironment.ipynb new file mode 100644 index 0000000..ac982be --- /dev/null +++ b/affordances_theory/AffordancesInDiscreteEnvironment.ipynb @@ -0,0 +1,2449 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "AffordancesInDiscreteEnvironment.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "AXY3q8CdsGAR", + "colab_type": "text" + }, + "source": [ + "Copyright 2020 \"What Can I do Here? A Theory of Affordances In Reinforcement Learning\" Authors. All rights reserved." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lJB5fq1g81z2", + "colab_type": "code", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 343 + }, + "outputId": "83ca2438-9624-4a09-c936-3457a1b1ab9c" + }, + "source": [ + "#@title Click to Install and import libraries.\n", + "# Please follow the instructions in the README for dependencies installation.\n", + "!git clone https://github.com/kkhetarpal/emdp.git\n", + "%cd emdp/\n", + "!pip install -e .\n", + "!pip install matplotlib==3.0.2\n", + "\n", + "#@title General Imports\n", + "%tensorflow_version 2.x\n", + "from collections import defaultdict\n", + "import enum\n", + "import functools\n", + "import os\n", + "import random\n", + "import sys\n", + "import time\n", + "from datetime import datetime\n", + "\n", + "import emdp\n", + "from emdp import actions\n", + "from emdp.gridworld import GridWorldPlotter\n", + "from emdp.gridworld import build_simple_grid\n", + "from emdp.gridworld.builder_tools import TransitionMatrixBuilder\n", + "from emdp.gridworld.env import GridWorldMDP\n", + "from emdp.gridworld.helper_utilities import get_state_after_executing_action\n", + "from emdp.gridworld.helper_utilities import check_can_take_action\n", + "from emdp.gridworld.helper_utilities import get_possible_actions\n", + "from emdp.gridworld.helper_utilities import get_state_after_executing_action\n", + "from emdp.gridworld.txt_utilities import get_char_matrix\n", + "from emdp.gridworld.txt_utilities import build_gridworld_from_char_matrix\n", + "from emdp.examples.simple import build_four_rooms_example\n", + "from emdp.utils import convert_int_rep_to_onehot, convert_onehot_to_int\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.animation as animation\n", + "import numpy as np\n", + "from numpy.linalg import inv\n", + "\n", + "import seaborn as sns\n", + "import tensorflow as tf\n", + "\n", + "color_ls = [[102, 120, 173],\n", + " 
[118, 167, 125], \n", + " [198, 113, 113], \n", + " [230, 169, 132],\n", + " [169, 193, 213],\n", + " [192, 197, 182],\n", + " [210, 180, 226]]\n", + "colors = [[shade / 255.0 for shade in rgb] for rgb in color_ls]\n", + "markers = ['o', 's', 'D', '^', '*', 'x', 'p', '+', 'v','|']\n", + "\n", + "DEFAULT_ARROW_COLOR = '#a65628'" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "fatal: destination path 'emdp' already exists and is not an empty directory.\n", + "/content/emdp\n", + "Obtaining file:///content/emdp\n", + "Requirement already satisfied: numpy>=1.9.1 in /usr/local/lib/python3.6/dist-packages (from emdp==0.0.4) (1.18.5)\n", + "Installing collected packages: emdp\n", + " Found existing installation: emdp 0.0.4\n", + " Can't uninstall 'emdp'. No files were found to uninstall.\n", + " Running setup.py develop for emdp\n", + "Successfully installed emdp\n", + "Requirement already satisfied: matplotlib==3.0.2 in /usr/local/lib/python3.6/dist-packages (3.0.2)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib==3.0.2) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib==3.0.2) (1.2.0)\n", + "Requirement already satisfied: numpy>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from matplotlib==3.0.2) (1.18.5)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib==3.0.2) (2.8.1)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib==3.0.2) (2.4.7)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib==3.0.2) (1.12.0)\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: 
pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n", + " import pandas.util.testing as tm\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gDpLK58xElDQ", + "colab_type": "text" + }, + "source": [ + "# Helper functions" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Y6_lV_vUIhs3", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Plotting and verifying matrices.\n", + "def plot_environment(\n", + " mdp, ax, wall_locs=None, plot_grid=False,\n", + " grid_kwargs=None,\n", + " wall_color=(0, 0, 0, 1), # R, G, B, alpha\n", + " ):\n", + " \"\"\"Function to plot emdp environment\n", + "\n", + " Args:\n", + " mdp: The MDP to use.\n", + " ax: The axes to plot this on.\n", + " wall_locs: Locations of the walls for plotting them in a different color.\n", + " plot_grid: Boolean indicating if the overlay grid should be plotted.\n", + " grid_kwargs: Grid keyword argrument specification.\n", + " wall_color: RGB color of the walls.\n", + "\n", + " Returns:\n", + " ax: The axes of the final plot.\n", + " imshow_ax: The final plot.\n", + " \"\"\"\n", + " grid_kwargs = grid_kwargs or {}\n", + "\n", + " # Plot states with white background.\n", + " state_background = np.ones((mdp.size, mdp.size))\n", + "\n", + " # Walls appear in a different color.\n", + " wall_img = np.ones((mdp.size, mdp.size, 4))\n", + " if wall_locs is not None:\n", + " for state in wall_locs:\n", + " y_coord = state[0]\n", + " x_coord = state[1]\n", + " wall_img[y_coord, x_coord, :] = np.array(wall_color)\n", + "\n", + " # Render the heatmap and overlay the walls.\n", + " imshow_ax = ax.imshow(state_background, interpolation=None)\n", + " imshow_ax = ax.imshow(wall_img, interpolation=None)\n", + " ax.grid(False)\n", + " \n", + " # Switch on flag if you want to plot grid \n", + " if plot_grid:\n", + " for i in range(mdp.size + 1):\n", + " ax.plot(\n", + " 
np.arange(mdp.size + 1) - 0.5,\n", + " np.ones(mdp.size + 1) * i - 0.5,\n", + " **grid_kwargs)\n", + "\n", + " for i in range(mdp.size + 1):\n", + " ax.plot(\n", + " np.ones(mdp.size + 1) * i - 0.5,\n", + " np.arange(mdp.size + 1) - 0.5,\n", + " **grid_kwargs)\n", + " ax.set_xlabel('x')\n", + " ax.set_ylabel('y')\n", + "\n", + " return ax, imshow_ax\n", + "\n", + "\n", + "def get_current_state_integer(state_):\n", + " return np.argmax(state_, axis=0)\n", + "\n", + "\n", + "def get_stateid(x, y, size):\n", + " \"\"\"Converts an (x, y) coordinate into the state id.\"\"\"\n", + " return size * x + y\n", + "\n", + "\n", + "def _is_absorbing(state_int, mdp_size):\n", + " \"\"\"Checks if the state_int is an absorbing state\"\"\"\n", + " return state_int == mdp_size * mdp_size\n", + "\n", + "\n", + "def _checking_P(P):\n", + " \"\"\"Checks if the P matrix is valid.\"\"\"\n", + " assert np.all(P <= 1.0) and np.all(P >= 0.0)\n", + " assert not np.allclose(P, 1.0)\n", + " assert not np.allclose(P, 0.0)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I9SJJN8aTuKL", + "colab_type": "text" + }, + "source": [ + "## Grid World Environment" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bmdjTf1-Dd4a", + "colab_type": "code", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 332 + }, + "outputId": "8df6e3ee-a090-4a96-8f42-96e31bdfeed5" + }, + "source": [ + "#@title Create the one room example.\n", + "_ONE_ROOM_TXT = \"\"\"#############\n", + "# #\n", + "# g #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# ####### #\n", + "# #\n", + "# #\n", + "# #\n", + "#s #\n", + "#############\"\"\".split('\\n')\n", + "\n", + "\n", + "def build_one_room_example(gamma=0.99, seed=2017, p_success=1.0):\n", + " char_matrix = get_char_matrix(_ONE_ROOM_TXT)\n", + " return build_gridworld_from_char_matrix(\n", + " char_matrix, p_success=p_success, seed=seed, gamma=gamma)\n", 
+ "\n", + "\n", + "mdp, mdp_wall_locs = build_one_room_example()\n", + "\n", + "gwp = GridWorldPlotter.from_mdp(mdp)\n", + "\n", + "fig = plt.figure(figsize=(10, 4))\n", + "ax = fig.add_subplot(121)\n", + "\n", + "plot_environment(mdp, ax, wall_locs=mdp_wall_locs, plot_grid=True, \n", + " grid_kwargs={'color':(220 / 255, 220 / 255, 220 / 255, 0.8)})\n" + ], + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(,\n", + " )" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 48 + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
# @title Functions for building transition matrices and affordance grids.

def build_simple_grid_stochastic_states(
    size=5, terminal_states=(), p_success=1):
  """Builds gridworld dynamics with a random success probability per (s, a).

  The agent can move LEFT, RIGHT, UP or DOWN. For every (state, action) pair a
  success probability is drawn with `random.uniform(0.10, 1.0)`; the remaining
  probability is split evenly over the other moves that
  `get_possible_actions` reports for that state. An action that cannot be
  taken leaves the agent in place with the drawn success probability.

  If `terminal_states` is non-empty, one extra absorbing state is appended as
  the last state index: every terminal state moves there under every action,
  and the absorbing state loops onto itself.

  Note that this is a function modified from the emdp package to customise
  the MDP transitions.

  Args:
    size: Side length of the square grid world.
    terminal_states: Locations of the terminal states, a sequence of (x, y)
      tuples.
    p_success: Not used. It is kept so existing calls keep working; the
      success probability is always redrawn for each (state, action) pair.

  Returns:
    P: Transition array of shape (n_states, 4, n_states), where n_states is
      size * size, plus one when terminal states are given.

  Raises:
    ValueError: If an action is not one of LEFT, RIGHT, UP or DOWN.
  """
  n_actions = 4
  has_terminal = len(terminal_states) > 0
  # One extra entry in the state vector for the absorbing state, if any.
  n_states = size * size + (1 if has_terminal else 0)
  terminal_ids = [int(size * loc[0] + loc[1]) for loc in terminal_states]

  def transition_row(state_idx, action, p_ok):
    """Returns the next-state distribution for one (state, action) pair."""
    row = np.zeros(n_states)
    if state_idx in terminal_ids:
      # Whatever the action, a terminal state moves to the absorbing state.
      row[-1] = 1
      return row
    if has_terminal and state_idx == n_states - 1:
      # The absorbing state transitions back onto itself.
      row[-1] = 1
      return row
    if action not in (actions.LEFT, actions.RIGHT, actions.UP, actions.DOWN):
      # The original raised `InvalidActionError`, which is not defined in this
      # notebook, so this branch could only ever produce a NameError.
      raise ValueError(
          'Invalid action {} in the 2D gridworld'.format(action))

    # TODO: distinguish between capability of slipping and taking wrong action
    # vs failing to execute action.
    p_slip = 1 - p_ok
    can_move = check_can_take_action(action, state_idx, size)
    # Copy so that removing the chosen action never touches the helper's list.
    other_moves = list(get_possible_actions(state_idx, size))
    if can_move:
      if action in other_moves:
        row[get_state_after_executing_action(
            action, state_idx, size)] = p_ok
        other_moves.remove(action)
    else:
      row[state_idx] = p_ok  # Cannot take the action: stay in place.
    for other_action in other_moves:
      row[get_state_after_executing_action(
          other_action, state_idx, size)] = p_slip / len(other_moves)
    return row

  P = np.zeros((n_states, n_actions, n_states))
  for s in range(n_states):
    for a in range(n_actions):
      # MDP states are considered stochastic such that the probability
      # of success of a (s, a) pair is different for different states,
      # in particular within the range [0.1, 1.0].
      P[s, a, :] = transition_row(s, a, random.uniform(0.10, 1.0))

  return P
+ " assert np.allclose(P.sum(axis=2), 1), 'P matrix is not stochastic!'\n", + "\n", + "\n", + "def build_affordance_grid(\n", + " affordances, size=13, p_success=1.0, terminal_states=()):\n", + " \"\"\"Builds a backbone for intent induced mdp transition matrix\n", + "\n", + " Args:\n", + " affordances: AF of shape |S| * |A|\n", + " size: The grid size of the mdp.\n", + " p_success: The probability of success for a transition.\n", + " mdp: Environment specified as mdp.\n", + " terminal_states: List of terminal states.\n", + "\n", + " Returns:\n", + " Returns model dynamics array of shape |S| x |A| x |S|\n", + " \"\"\"\n", + " p_fail = 1 - p_success\n", + "\n", + " n_states = size * size\n", + " grid_states = n_states # the number of entries of the state vector\n", + " # corresponding to the grid itself.\n", + " if len(terminal_states) > 0:\n", + " n_states += 1 # add an entry to state vector for terminal state\n", + " terminal_states = list(\n", + " map(lambda tupl: int(size * tupl[0] + tupl[1]), terminal_states))\n", + "\n", + " def create_state_list_for_action(state_idx, action):\n", + " transition_probs = np.zeros(n_states)\n", + " if state_idx in terminal_states:\n", + " # no matter what action you take you should go to the absorbing state\n", + " transition_probs[-1] = 1\n", + " elif state_idx == n_states - 1 and len(terminal_states) > 0:\n", + " # absorbing state, you should just transition back here whatever action you take.\n", + " transition_probs[-1] = 1\n", + "\n", + " elif action in [actions.LEFT, actions.RIGHT, actions.UP, actions.DOWN]:\n", + " # valid action, now see if we can actually execute this action\n", + " # in this state:\n", + " if check_can_take_action(action, state_idx, size):\n", + " # yes we can\n", + " possible_actions = get_possible_actions(state_idx, size)\n", + " if action in possible_actions:\n", + " transition_probs[get_state_after_executing_action(\n", + " action, state_idx, size)] = p_success\n", + " 
possible_actions.remove(action)\n", + " for other_action in possible_actions:\n", + " transition_probs[get_state_after_executing_action(\n", + " other_action, state_idx, size)] = p_fail / len(possible_actions)\n", + "\n", + " else:\n", + " possible_actions = get_possible_actions(state_idx, size)\n", + " transition_probs[\n", + " state_idx] = p_success # cant take action, stay in same place\n", + " for other_action in possible_actions:\n", + " transition_probs[get_state_after_executing_action(\n", + " other_action, state_idx, size)] = p_fail / len(possible_actions)\n", + " else:\n", + " raise InvalidActionError(\n", + " 'Invalid action {} in the 2D gridworld'.format(action))\n", + " return transition_probs\n", + "\n", + " n_states = size * size\n", + " n_actions = 4\n", + " if len(terminal_states) > 0:\n", + " n_states += 1 # add an entry to state vector for terminal state\n", + "\n", + " P = np.zeros((n_states, n_actions, n_states))\n", + " for s in range(n_states):\n", + " for a in range(n_actions):\n", + " if affordances[s, a] != 0.:\n", + " P[s, a, :] = create_state_list_for_action(s, a)\n", + " else:\n", + " P[s, a, s] = 1.0\n", + " return P\n", + "\n", + "\n", + "def _construct_dynamics(mdp, affordances, size, p_success=1.0, wall_locs=None):\n", + " \"\"\"\n", + " Function to construct transition dynamics P\n", + " Args:\n", + " mdp: The mdp.\n", + " affordances: The wall locations of the mdp.\n", + " size: The grid size of the mdp.\n", + " p_success: Probability of success.\n", + " wall_locs: The location of the walls in the mdp.\n", + "\n", + " Returns:\n", + " P: initialized transition matrix of shape |S| x |A| x |S|\n", + " \"\"\"\n", + "\n", + " if wall_locs is None:\n", + " raise ValueError('Please give me wall locations.')\n", + "\n", + " grid_size = mdp.size\n", + "\n", + " assert len(mdp.terminal_states) == 1, 'Only one terminal state supported.'\n", + " goal_loc = mdp.unflatten_state(\n", + " convert_int_rep_to_onehot(mdp.terminal_states[0], 
mdp.state_space))\n", + " # Attempt to make the desired gridworld.\n", + " reward_spec = {(goal_loc[0], goal_loc[1]): +1}\n", + "\n", + " tmb = TransitionMatrixBuilder(grid_size, has_terminal_state=True)\n", + "\n", + " # For the purposes of constructing the dynamics matrix\n", + " # and to match the way the library deals with MDPs, we set the\n", + " # walls to have affordable actions.\n", + " # affordances = affordances.copy()\n", + " # affordances[wall_locs, :] = 1\n", + " terminal_states = reward_spec.keys()\n", + " basic_affordance_grid = build_affordance_grid(\n", + " affordances,\n", + " size=mdp.size,\n", + " p_success=p_success,\n", + " terminal_states=terminal_states)\n", + "\n", + " tmb._P = basic_affordance_grid\n", + " for (r, c) in wall_locs:\n", + " tmb.add_wall_at((r, c))\n", + " P = tmb.P\n", + " _unit_test_P(P)\n", + " _checking_P(P)\n", + " return P\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cT39F8t1VHLQ", + "colab_type": "text" + }, + "source": [ + "## Affordances" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "A4sm9YlzrSP3", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Function to compute intent completion.\n", + "def _get_intent_completed(\n", + " mdp, state, action, statet,\n", + " threshold=0.0,\n", + " intent_name='collection',\n", + " P_AF=None):\n", + " \"\"\"Determines if a transition completed an intent.\n", + "\n", + " Args:\n", + " mdp: The MDP to evaluate the intent on.\n", + " state: The one hot representation of the current state\n", + " action: The integer representation of the action (currently unused).\n", + " statet: The state after taking the action.\n", + " threshold: The threshold to select actions with\n", + " intent_name: The name of the intent to calculate. Three intents are\n", + " currently supported \"collection\", \"up\" and \"left\". 
Collection is the union\n", + " over all intents.\n", + " P_AF: A probability transition matrix for the affordances.\n", + "\n", + " Returns:\n", + " An integer that represents if the intent is completed.\n", + " \"\"\"\n", + " x_t, y_t = mdp.unflatten_state(state)\n", + " x_tp1, y_tp1 = mdp.unflatten_state(statet)\n", + "\n", + " delta_x = x_t - x_tp1\n", + " delta_y = y_t - y_tp1\n", + "\n", + " state_int = convert_onehot_to_int(state)\n", + " next_state_int = convert_onehot_to_int(statet)\n", + "\n", + " # Select affordable actions based on the threshold.\n", + " # Default value is True for default threshold=0.0\n", + " prob_gt_threshold = True\n", + " if P_AF is not None:\n", + " prob_of_going_there = P_AF[state_int, action, next_state_int]\n", + " else:\n", + " prob_of_going_there = mdp.P[state_int, action, next_state_int]\n", + " prob_gt_threshold = prob_of_going_there >= threshold\n", + "\n", + " if intent_name == 'collection':\n", + " # If the agent has moved in any direction return a 1.0.\n", + " if (int(delta_x) or int(delta_y)) and prob_gt_threshold:\n", + " return 1.0\n", + " else:\n", + " return 0.0\n", + " elif intent_name == 'up':\n", + " if not(_is_absorbing(next_state_int, mdp.size)):\n", + " if (x_tp1 < x_t) and prob_gt_threshold:\n", + " return 1.0\n", + " else:\n", + " return 0.0\n", + " else:\n", + " if action==actions.UP:\n", + " return 1.0\n", + " else:\n", + " return 0.0\n", + " elif intent_name == 'left':\n", + " if not(_is_absorbing(next_state_int, mdp.size)):\n", + " if (y_tp1 < y_t) and prob_gt_threshold:\n", + " return 1.0\n", + " else:\n", + " return 0.0\n", + " else:\n", + " if action==actions.LEFT:\n", + " return 1.0\n", + " else:\n", + " return 0.0\n", + " else:\n", + " print(\"Not a valid Intent, See _get_intent_completed\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "nCjcsWdWTJmp", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title 
Function to compute the affordances from an intent completion function.\n", + "def _compute_affordances(\n", + " mdp, n_states, n_actions, intent_name,\n", + " threshold, mdp_wall_locs):\n", + " '''\n", + " Args:\n", + " n_states: number of states in mdp\n", + " n_actions: number of actions in mdp \n", + " intent_name: name of the intent collection, up, etc.\n", + " threshold: float value between 0-1\n", + " mdp_wall_locs: list of mdp wall locations\n", + "\n", + " Returns:\n", + " Affordances in the form of a |S| * |A| array. The array has entries of\n", + " 1.0 or 0.0 based on the intent I_a(s') is true or not respectively.\n", + " '''\n", + " affordances = np.zeros((n_states, n_actions))\n", + " for s in range(n_states):\n", + " x_t, y_t = mdp.unflatten_state(\n", + " convert_int_rep_to_onehot(s, mdp.state_space))\n", + " if (x_t, y_t) in mdp_wall_locs:\n", + " # You're in a wall nothing is affordable.\n", + " continue\n", + " for a in range(n_actions):\n", + " if not(_is_absorbing(s, mdp.size)):\n", + " s_next = get_state_after_executing_action(a, s, mdp.size)\n", + " x, y = mdp.unflatten_state(\n", + " convert_int_rep_to_onehot(s_next, mdp.state_space))\n", + " if (x,y) in mdp_wall_locs:\n", + " s_next = s\n", + " intent = _get_intent_completed(\n", + " mdp, state=convert_int_rep_to_onehot(s, mdp.state_space),\n", + " action=a, statet=convert_int_rep_to_onehot(\n", + " s_next, mdp.state_space),\n", + " threshold=threshold, intent_name=intent_name)\n", + " affordances[s, a] = intent\n", + "\n", + " # Hard code terminal states to be affordable since all actions are possible.\n", + " if s in mdp.terminal_states:\n", + " affordances[s, :] = 1.0\n", + " # Hard code absorbing state to be affordable (Environmental quirk).\n", + " affordances[-1, :] = 1.0\n", + " return affordances" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kUZ6lanOJcTV", + "colab_type": "text" + }, + "source": [ + "## Value Iteration 
and Policy Evaluation algorithms" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "E5delmlK9xFf", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Code for value iteration.\n", + "# We build this function on the following base code:\n", + "# https://github.com/andrecianflone/policy_value_iteration\n", + "def value_iteration(\n", + " r, p, theta=0.0001, gamma=0.99, max_iteration=100,\n", + " AF=None, seed=None, min_mask_value=-10, mdp_wall_locs=None):\n", + " \"\"\"Value iteration computes value & policy for a reward and transiton matrix.\n", + "\n", + " Args:\n", + " r: Rewards, array of shape |S| x |A|.\n", + " p: State transition probabilities, array of shape |S| x |A| x |S|.\n", + " theta: Stop if the change in value fn is less than this value.\n", + " gamma: Discount factor.\n", + " max_iteration: Maximum number of iterations to run VI.\n", + " AF: Affordances of shape |S| x |A|.\n", + " seed: Seed value for randomness.\n", + " min_mask_value: An optional check for values to not be negative.\n", + " mdp_wall_locs: Wall locations in the mdp.\n", + "\n", + " Returns:\n", + " pi: Policy, |S| x |A|.\n", + " v: State values, |S|.\n", + " it: Number of iterations.\n", + " seconds: Planning time in seconds.\n", + " v_log: Log of value functions from init to convergence.\n", + " \"\"\"\n", + " if AF is not None:\n", + " AF = AF.copy()\n", + " assert mdp_wall_locs is not None, 'If AF is given, wall locs must also be given.'\n", + " if seed is not None:\n", + " np.random.seed(seed)\n", + " random.seed(seed)\n", + " t1 = datetime.now()\n", + " n_states, n_actions = p.shape[:2]\n", + " v = np.zeros(n_states)\n", + " v_log = np.zeros((max_iteration + 1, n_states))\n", + "\n", + " if mdp_wall_locs:\n", + " wall_states_idx = list(\n", + " map(convert_onehot_to_int, map(mdp.flatten_state, mdp_wall_locs)))\n", + "\n", + " # Mask out walls to be zero. 
But uncovered states should be -inf.\n", + " if AF is not None:\n", + " # Accounting for emdp environment behaviour.\n", + " AF[wall_states_idx] = 1\n", + " mask = np.logical_not(AF).astype(np.float32)\n", + " for it in range(max_iteration+1):\n", + " q = r + gamma * np.einsum('ijk, k->ij', p, v)\n", + " minimum_v = np.min(q)\n", + "\n", + " if AF is not None:\n", + " q = AF * q + mask * minimum_v\n", + " \n", + " v_new = np.max(q, axis=1)\n", + " if np.all(np.absolute(v-v_new) < theta):\n", + " v = v_new\n", + " v_log[it, :] = v\n", + " v_iters = v_log[:it, :]\n", + " break\n", + " v = v_new\n", + " v_log[it, :] = v\n", + " v_iters = v_log[:it, :]\n", + "\n", + " # Greedy policy extraction.\n", + " q_values = r + gamma * np.einsum('ijk, k->ij', p, v_new)\n", + " if AF is not None:\n", + " q_values = q_values * AF + mask * minimum_v\n", + "\n", + " # Use \"random\" argmax with stochastic tie-breaking:\n", + " rargmax = lambda arr: np.random.choice(np.flatnonzero(arr))\n", + " best_actions = np.apply_along_axis(\n", + " rargmax, 1, np.isclose(q_values, q_values.max(-1, keepdims=True)))\n", + " pi = np.eye(r.shape[1])[best_actions]\n", + " assert pi.shape == r.shape\n", + " \n", + " t2 = datetime.now()\n", + " seconds = (t2 - t1).total_seconds()\n", + " return pi, v, it+1, seconds, v_log" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "72u1xS4tKM7R", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Code for policy evaluation.\n", + "\n", + "def _policy_evaluation_exact(pi, r, p, gamma=0.99):\n", + " \"\"\"\n", + " Evaluate policy by taking the inverse\n", + " Args:\n", + " pi: Policy, array of shape |S| x |A|.\n", + " r: Rewards, array of shape |S| x |A|.\n", + " p: State transition probabilities, array of shape |S| x |A| x |S|.\n", + " Return:\n", + " v: 1D array with updated state values\n", + " \"\"\"\n", + " # Rewards according to policy: Hadamard product and 
row-wise sum\n", + " r_pi = np.einsum('ij,ij->i', pi, r)\n", + "\n", + " # Policy-weighted transitions:\n", + " # multiply p by pi by broadcasting pi, then sum second axis\n", + " # result is an array of shape |S| x |S|\n", + " p_pi = np.einsum('ijk, ij->ik', p, pi)\n", + " v = np.dot(inv((np.eye(p_pi.shape[0]) - gamma*p_pi)), r_pi)\n", + " return v" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xVH35aHyLfj9", + "colab_type": "text" + }, + "source": [ + "#Sec 6.1 Experiment 1: Planning with Intents\n", + "Evaluating the impact of intents and affordances on planning. $||V^{\\pi^{*}_{I}}_{M} - V^{\\pi^{*}}_{M}||_{n}$" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LG2pxLBcSqC2", + "colab_type": "code", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 329 + }, + "outputId": "6af0376f-9a4d-4488-bd61-cdd0850ee95e" + }, + "source": [ + "#@title Evaluate policy obtained from MDP M vs I in original MDP M\n", + "#------------------------------------------------------------------------#\n", + "# 1. Compute optimal value function and optimal policy in I \n", + "# 2. Policy Evaluation in MDP M\n", + "# 3. Report planning time in M vs I \n", + "# 4. 
#------------------------------------------------------------------------#
# For every success probability and threshold: plan in the intended MDP I,
# evaluate the resulting policy in the original MDP M, and compare it against
# the policy planned directly in M.
#------------------------------------------------------------------------#

p_success_probs = [1.0, 0.75, 0.50, 0.40, 0.30, 0.25]
thresholds = [0, 0.15, 0.25, 0.35, 0.45, 0.65, 0.75, 1.0]
max_iterations = 10000

# Initialize empty arrays to store results.
# NOTE(review): `valuefn_I` is never written in this cell although it is by
# far the largest allocation here -- confirm that a later cell still needs it.
valuefn_I = np.zeros(
    (len(p_success_probs), len(thresholds), max_iterations+1, mdp.state_space))
v_pi_star_I_M_thresholds = np.zeros(
    (len(p_success_probs), len(thresholds), mdp.state_space))
V_star_I_plan_time = np.zeros((len(p_success_probs), len(thresholds)))
V_star_M_plan_time = np.zeros(len(p_success_probs))
v_pi_star_M_M = np.zeros((len(p_success_probs), mdp.state_space))

# Stores the ratio of |AF|/(|S| x |A|)
AF_SA_size_ratio = np.zeros((len(p_success_probs), len(thresholds)))

# Iterate over the thresholds and success probs to create data for the plot.
for idx_p, p_success in enumerate(p_success_probs):
  for ind, k in enumerate(thresholds):
    # Create an mdp with a success prob of p_success
    mdp, mdp_wall_locs = build_one_room_example(p_success=p_success)

    # |S| x |A| size in base MDP M.
    state_action_space = mdp.state_space * mdp.action_space

    # Compute affordance matrix.
    AF = _compute_affordances(
        mdp=mdp,
        n_states=mdp.state_space,
        n_actions=mdp.action_space,
        intent_name="collection",
        threshold=k,
        mdp_wall_locs=mdp_wall_locs)

    # |S| x |A| size in intended MDP M_I
    AF_size = np.count_nonzero(AF)
    AF_SA_size_ratio[idx_p, ind] = AF_size/state_action_space

    # Construct P_I with a deterministic probability.
    P_affordances = _construct_dynamics(
        mdp, affordances=AF, size=mdp.size, p_success=1.0,
        wall_locs=mdp_wall_locs)

    _checking_P(P_affordances)

    try:
      # Compute optimal value function in MDP I with affordances
      (policy_star_I, V_star_I, _,
       V_star_I_seconds, V_star_I_iters) = value_iteration(
           mdp.R, P_affordances, max_iteration=max_iterations,
           AF=AF, mdp_wall_locs=mdp_wall_locs)
    except RuntimeError:
      print(f'No affordances found for threshold={k}, p_success={p_success}.')
      continue

    V_star_I_plan_time[idx_p, ind] = V_star_I_seconds

    # Evaluate the optimal policies from MDP I in the original mdp M.
    v_pi_star_I_M_thresholds[idx_p, ind, :] = _policy_evaluation_exact(
        pi=policy_star_I,
        r=mdp.R, p=mdp.P,
        gamma=mdp.gamma)

  # Compute optimal value function in original mdp.P
  (policy_star_M, V_star_M, _,
   V_star_M_seconds, V_star_M_iters) = value_iteration(
       mdp.R, mdp.P, max_iteration=max_iterations)

  V_star_M_plan_time[idx_p] = V_star_M_seconds

  # Evaluate the optimal policies in M in the original environment M
  v_pi_star_M_M[idx_p] = _policy_evaluation_exact(
      pi=policy_star_M,
      r=mdp.R,
      p=mdp.P,
      gamma=mdp.gamma)

# Compute the L2 value loss w.r.t. the ground mdp. The norm already discards
# the sign, so no explicit abs() is needed.
value_loss_to_plot = np.zeros((len(p_success_probs), len(thresholds)))
for p_id in range(len(p_success_probs)):
  for _thresh in range(len(thresholds)):
    value_loss_to_plot[p_id, _thresh] = np.linalg.norm(
        v_pi_star_M_M[p_id, :] -
        v_pi_star_I_M_thresholds[p_id, _thresh, :])

fig, ax = plt.subplots()
# NOTE(review): the curves are drawn at evenly spaced x positions, not at the
# (unevenly spaced) threshold values themselves -- confirm this is intended.
x_axis = np.linspace(0, 1, len(thresholds))
for i in range(value_loss_to_plot.shape[0]):
  ax.plot(
      x_axis, value_loss_to_plot[i, :], color=colors[i],
      label='p={}'.format(p_success_probs[i]), linewidth=3.00,
      marker=markers[i], markersize=14)

ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.19),
          fancybox=True, shadow=True, ncol=7,
          facecolor='w', fontsize=10)
# Raw strings: "\k" and "\p" are not valid escape sequences in normal string
# literals. The label text itself is unchanged.
ax.set_xlabel(r"Threshold $\kappa$", fontsize=18)
ax.set_ylabel(r"$ ||V^{\pi^{*}_{I}}_M - V^{*}_M||_2$", fontsize=18)
plt.title("Planning with Intents")
matplotlib.rc('axes', edgecolor='black')
plt.show()
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wWgD4QVJoNR3", + "colab_type": "text" + }, + "source": [ + "# Sec 6.2 Experiment 2: Planning time with & w/o Affordances" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WlY_AIqwGiqf", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Utility to create One-room and Pachinko Environments\n", + "\n", + "# One-Room gridsize_dict\n", + "ONE_ROOM_DIFF_GRIDSIZES = {\n", + " 7: \"\"\"#######\n", + "# #\n", + "# g #\n", + "# # #\n", + "# ### #\n", + "#s #\n", + "#######\"\"\".split('\\n'),\n", + " 9: \"\"\"#########\n", + "# #\n", + "# g #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "#s #\n", + "#########\"\"\".split('\\n'),\n", + " 13: \"\"\"#############\n", + "# #\n", + "# g #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# ####### #\n", + "# #\n", + "# #\n", + "# #\n", + "#s #\n", + "#############\"\"\".split('\\n'),\n", + " 15: \"\"\"###############\n", + "# #\n", + "# g #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "#s #\n", + "###############\"\"\".split('\\n'),\n", + " 17: \"\"\"#################\n", + "# #\n", + "# g #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# ########### #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "#s #\n", + "#################\"\"\".split('\\n'),\n", + " 19: \"\"\"###################\n", + "# #\n", + "# g #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# ############# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "# #\n", + "#s #\n", + "###################\"\"\".split('\\n'),\n", + " 25: \"\"\"#########################\n", + "# #\n", + "# g #\n", + "# #\n", + "# #\n", + "# #\n", + "# # #\n", + "# # #\n", + "# # #\n", + "# # #\n", + 
def build_one_room_gridsize(gamma=0.99, seed=2017, p_success=1.0, grid_size=13):
  """Builds the One-Room gridworld for one of the predefined grid sizes.

  Args:
    gamma: Forwarded unchanged to `build_gridworld_from_char_matrix`.
    seed: Forwarded unchanged to `build_gridworld_from_char_matrix`.
    p_success: Forwarded unchanged to `build_gridworld_from_char_matrix`.
    grid_size: Key into `ONE_ROOM_DIFF_GRIDSIZES` selecting the text layout.

  Returns:
    Whatever `build_gridworld_from_char_matrix` returns. Callers in this
    notebook unpack the result as `(mdp, wall_locs)`.

  Raises:
    KeyError: If `grid_size` is not a key of `ONE_ROOM_DIFF_GRIDSIZES`.
  """
  layout_rows = ONE_ROOM_DIFF_GRIDSIZES[grid_size]
  return build_gridworld_from_char_matrix(
      get_char_matrix(layout_rows),
      p_success=p_success,
      seed=seed,
      gamma=gamma)
def build_pachinko_gridsize(gamma=0.99, seed=2017, p_success=1.0, grid_size=13):
  """Builds the Pachinko gridworld for one of the predefined grid sizes.

  Args:
    gamma: Forwarded unchanged to `build_gridworld_from_char_matrix`.
    seed: Forwarded unchanged to `build_gridworld_from_char_matrix`.
    p_success: Forwarded unchanged to `build_gridworld_from_char_matrix`.
    grid_size: Key into `PACHINKO_DIFF_GRIDSIZES` selecting the text layout.

  Returns:
    Whatever `build_gridworld_from_char_matrix` returns. Callers in this
    notebook unpack the result as `(mdp, wall_locs)`.

  Raises:
    KeyError: If `grid_size` is not a key of `PACHINKO_DIFF_GRIDSIZES`.
  """
  layout_rows = PACHINKO_DIFF_GRIDSIZES[grid_size]
  return build_gridworld_from_char_matrix(
      get_char_matrix(layout_rows),
      p_success=p_success,
      seed=seed,
      gamma=gamma)
Multiple runs to account for randomness\n", + "#------------------------------------------------------------------------#\n", + "\n", + "seed = 10000 \n", + "p_success = 0.5\n", + "threshold = 0.5\n", + "max_iterations = 10000\n", + "nruns = 10\n", + "grid_sizes = [7, 9, 13, 15, 17, 19, 25] \n", + "\n", + "V_star_I_plan_time = np.zeros((len(grid_sizes), nruns))\n", + "V_star_M_plan_time = np.zeros((len(grid_sizes), nruns))\n", + "\n", + "# Run VI for MDP M\n", + "for size_idx, size_val in enumerate(grid_sizes):\n", + " for run_id in range(nruns):\n", + " # Create an mdp with grid size size_val, fixed p\n", + " _mdp, _mdp_walls = build_one_room_gridsize(\n", + " grid_size=size_val, p_success=p_success)\n", + "\n", + " # Compute V* in original _mdp.P\n", + " (policy_star_M, V_star_M, _,\n", + " V_star_M_seconds, V_star_M_iters) = value_iteration(\n", + " _mdp.R, _mdp.P, max_iteration=max_iterations, \n", + " seed=seed+run_id)\n", + " \n", + " V_star_M_plan_time[size_idx, run_id] = V_star_M_seconds\n", + "\n", + " # Compute Affordances AF based on intent I\n", + " _AF = _compute_affordances(mdp=_mdp,\n", + " n_states=_mdp.state_space,\n", + " n_actions=_mdp.action_space,\n", + " intent_name=\"collection\",\n", + " threshold=threshold,\n", + " mdp_wall_locs=_mdp_walls)\n", + "\n", + " #construct P_I with a determinsitic probability\n", + " _P_affordances = _construct_dynamics(_mdp,\n", + " affordances=_AF,\n", + " size=_mdp.size,\n", + " p_success=1.0,\n", + " wall_locs=_mdp_walls)\n", + " _checking_P(_P_affordances)\n", + "\n", + " # Compute V*_I in mdp M_I with AF\n", + " (policy_star_I, V_star_I, _,\n", + " V_star_I_seconds, V_star_I_iters) = value_iteration(\n", + " _mdp.R, _P_affordances, max_iteration=max_iterations,\n", + " seed=seed+run_id)\n", + " \n", + " V_star_I_plan_time[size_idx, run_id] = V_star_I_seconds\n", + "\n", + "V_star_I_plan_time_avg = np.mean(V_star_I_plan_time, axis=1)\n", + "V_star_I_plan_time_std = np.std(V_star_I_plan_time, axis=1)\n", + 
"V_star_I_plan_time_CI = V_star_I_plan_time_std/np.sqrt(nruns)\n", + "\n", + "\n", + "V_star_M_plan_time_avg = np.mean(V_star_M_plan_time, axis=1)\n", + "V_star_M_plan_time_std = np.std(V_star_M_plan_time, axis=1)\n", + "V_star_M_plan_time_CI = V_star_M_plan_time_std/np.sqrt(nruns) \n", + "\n", + "#@title Plot Planning Time: TwoRooms\n", + "fig = plt.figure(figsize=(4,3.5))\n", + "ax1 = fig.add_subplot(1,1,1)\n", + "# sns.set_context(\"paper\")\n", + "sns.set_style('white')\n", + "sns.set_context(\"paper\", font_scale=1.85)\n", + "\n", + "fig.patch.set_facecolor('1.0')\n", + "\n", + "plt.grid(False)\n", + "x_axis = grid_sizes\n", + "xi = list(range(len(x_axis)))\n", + "\n", + "ax1.plot(xi, V_star_I_plan_time_avg, color = colors[1], label = '$V^{*}_{M_I}$', \n", + " linewidth=4.00, marker=markers[1], markersize=10)\n", + "ax1.fill_between(xi, V_star_I_plan_time_avg-V_star_I_plan_time_CI,\n", + " V_star_I_plan_time_avg+V_star_I_plan_time_CI,\n", + " facecolor=colors[1], edgecolor=colors[1], alpha=0.5)\n", + "\n", + "ax1.plot(xi, V_star_M_plan_time_avg, color = colors[3], label = '$V^{*}_{M}$',\n", + " linewidth=4.00, marker=markers[3], markersize=10)\n", + "ax1.fill_between(xi, V_star_M_plan_time_avg-V_star_M_plan_time_CI,\n", + " V_star_M_plan_time_avg+V_star_M_plan_time_CI,\n", + " facecolor=colors[3], edgecolor=colors[3], alpha=0.5)\n", + "\n", + "ax1.legend(loc='upper left', bbox_to_anchor=(0.05, 0.9),\n", + " fancybox=True, shadow=True, ncol=1,\n", + " facecolor='w', fontsize=15)\n", + "ax1.set_xlabel(\"Grid size\", fontsize=16)\n", + "ax1.set_ylabel(\"Planning Time\\n(seconds)\", fontsize=16) \n", + "xticks_pos = [0, 3, 6]\n", + "ax1.set_xticks(xticks_pos)\n", + "ax1.set_xticklabels([grid_sizes[i] for i in xticks_pos])\n", + "fig.tight_layout()\n", + "\n", + "plt.show()\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
#@title Planning Time with Increasing Grid Size: Pachinko
#------------------------------------------------------------------------#
# Measures value-iteration planning time on Pachinko grids of growing size,
# once in the original MDP M and once in the affordance-restricted model.
#
# 1. MDP's will have p_success 0.5
# 2. Choose a threshold of 0.5 for AF computation
# 3. For each grid size in grid_sizes (repeated nruns times):
#    Compute V_star_M, V_star_M_I
#    Planning time for each
#    Return V_star_M, V_star_M_I plan times
#------------------------------------------------------------------------#

seed = 10000
p_success = 0.5
threshold = 0.5
max_iterations = 20000
grid_sizes = [7, 9, 13, 15, 17, 19, 25] 
nruns = 10

# NOTE(review): '/content/' is an absolute, Colab-specific path. The directory
# created here is not written to anywhere in this cell -- confirm whether a
# later cell relies on it before removing.
outer_dir = os.path.dirname('/content/')
dir_name = "planning_valueloss"
dir_name = os.path.join(outer_dir, dir_name)
if not os.path.exists(dir_name):
  os.makedirs(dir_name)

# One planning-time measurement per (grid size, run).
V_star_I_plan_time = np.zeros((len(grid_sizes), nruns))
V_star_M_plan_time = np.zeros((len(grid_sizes), nruns))


# Run VI for MDP M
for size_idx, size_val in enumerate(grid_sizes):
  for run_id in range(nruns):
    # Create an mdp with grid size size_val, fixed p
    _mdp, _mdp_walls = build_pachinko_gridsize(
        grid_size=size_val, p_success=p_success)

    # Compute V* in original _mdp.P
    # Each run passes a different seed (seed + run_id) to value_iteration.
    (policy_star_M, V_star_M, _,
     V_star_M_seconds, V_star_M_iters) = value_iteration(
         _mdp.R, _mdp.P, max_iteration=max_iterations,
         seed=seed+run_id)
    
    V_star_M_plan_time[size_idx, run_id] = V_star_M_seconds

    # Compute Affordances AF based on intent I
    _AF = _compute_affordances(mdp=_mdp,
                               n_states=_mdp.state_space,
                               n_actions=_mdp.action_space,
                               intent_name="collection",
                               threshold=threshold,
                               mdp_wall_locs=_mdp_walls)
    
    
    # Construct P_I with a deterministic probability (p_success=1.0), unlike
    # the p_success=0.5 used for the original MDP above.
    _P_affordances = _construct_dynamics(_mdp,
                                         affordances=_AF,
                                         size=_mdp.size,
                                         p_success=1.0,
                                         wall_locs=_mdp_walls)
    _checking_P(_P_affordances)

    # Compute V*_I in mdp M_I with AF
    # Same reward and seed as the run in M above; only the dynamics differ.
    (policy_star_I, V_star_I, _,
     V_star_I_seconds, V_star_I_iters) = value_iteration(
         _mdp.R, _P_affordances, max_iteration=max_iterations,
         seed=seed+run_id)
    
    V_star_I_plan_time[size_idx, run_id] = V_star_I_seconds


# Aggregate over runs (axis=1). The "*_CI" arrays are std / sqrt(nruns), i.e.
# the standard error of the mean rather than a confidence interval; np.std is
# called with its default ddof.
V_star_I_plan_time_avg = np.mean(V_star_I_plan_time, axis=1)
V_star_I_plan_time_std = np.std(V_star_I_plan_time, axis=1)
V_star_I_plan_time_CI = V_star_I_plan_time_std/np.sqrt(nruns)


V_star_M_plan_time_avg = np.mean(V_star_M_plan_time, axis=1)
V_star_M_plan_time_std = np.std(V_star_M_plan_time, axis=1)
V_star_M_plan_time_CI = V_star_M_plan_time_std/np.sqrt(nruns) 


#@title Plot Planning Time: Pachinko
fig = plt.figure(figsize=(4,3.5))
ax1 = fig.add_subplot(1,1,1)
sns.set_style('white')
sns.set_context("paper", font_scale=1.85)
fig.patch.set_facecolor('1.0')
plt.grid(False)
x_axis = grid_sizes
# Plot against the index of each grid size, so the points are evenly spaced
# even though the sizes themselves are not.
xi = list(range(len(x_axis)))

ax1.plot(xi, V_star_I_plan_time_avg, color = colors[1], label = '$V^{*}_{M_I}$', 
         linewidth=4.00, marker=markers[1], markersize=10)
ax1.fill_between(xi, V_star_I_plan_time_avg-V_star_I_plan_time_CI,
                 V_star_I_plan_time_avg+V_star_I_plan_time_CI,
                 facecolor=colors[1], edgecolor=colors[1], alpha=0.5)

ax1.plot(xi, V_star_M_plan_time_avg, color = colors[3], label = '$V^{*}_{M}$',
         linewidth=4.00, marker=markers[3], markersize=10)
ax1.fill_between(xi, V_star_M_plan_time_avg-V_star_M_plan_time_CI,
                 V_star_M_plan_time_avg+V_star_M_plan_time_CI,
                 facecolor=colors[3], edgecolor=colors[3], alpha=0.5)

ax1.legend(loc='upper left', bbox_to_anchor=(0.05, 0.9),
           fancybox=True, shadow=True, ncol=1,
           facecolor='w', fontsize=15)
ax1.set_xlabel("Grid size", fontsize=16)
ax1.set_ylabel("Planning Time\n(seconds)", fontsize=16) 
# Only label the first, middle and last grid sizes to keep the axis readable.
xticks_pos = [0, 3, 6]
ax1.set_xticks(xticks_pos)
ax1.set_xticklabels([grid_sizes[i] for i in xticks_pos])
fig.tight_layout()

plt.show()
def get_trajectories_transitions(
    mdp, num_trajectories=500, max_trajectory_length=50, policy=None,
    intent_name='collection', random_starts=False, seed=None):
  """Samples transitions from an environment by rolling out a policy.

  Episodes are restarted whenever the environment reports `done` or the
  current episode has reached `max_trajectory_length` steps.

  Args:
    mdp: The MDP to sample from.
    num_trajectories: The total number of environment steps (transitions) to
      collect across all episodes. Despite the name this is NOT a number of
      episodes; the name is kept for backward compatibility.
    max_trajectory_length: The maximum number of steps in a single episode
      before the environment is restarted.
    policy: The policy to sample with. If None, actions are drawn uniformly
      with `np.random.randint(mdp.action_space)`. The policy must take a
      single argument, the state as returned by the environment.
    intent_name: Name of the intent passed to `_get_intent_completed`.
    random_starts: If True, every episode starts from
      `get_randomized_state(mdp)`; otherwise from `mdp.reset()`.
    seed: If not None, seeds both `np.random` and `random`.

  Returns:
    A 2-tuple `(trajectory, human_readable)`:
      trajectory: list of `(state, action, next_state, reward)` tuples, one
        per step, with states converted by `convert_onehot_to_int`.
      human_readable: set of the unique transitions, with states converted
        by `mdp.unflatten_state` and actions wrapped in `Actions`.
  """
  if seed is not None:
    np.random.seed(seed)
    random.seed(seed)
    print(f'seed set to {seed}')

  def _start_episode():
    # Single place that decides how an episode begins.
    if random_starts:
      return get_randomized_state(mdp)
    return mdp.reset()

  trajectory = []
  human_readable = set()
  s_t = _start_episode()
  trajectory_length = 0

  if policy is None:
    def policy(_):
      return np.random.randint(mdp.action_space)

  for _ in range(num_trajectories):
    action = policy(s_t)
    s_tp1, reward, done, _ = mdp.step(action)

    # NOTE(review): neither result below is used by this function. The calls
    # are kept because the helpers are defined elsewhere and may validate
    # their inputs -- confirm and remove if they are pure.
    state_int = get_current_state_integer(s_t)
    intent = _get_intent_completed(
        mdp, s_t, action, s_tp1, intent_name=intent_name)

    # Human readable version:
    human_readable.add((
        mdp.unflatten_state(s_t),
        Actions(action),
        mdp.unflatten_state(s_tp1),
        reward))

    trajectory.append((
        convert_onehot_to_int(s_t), action,
        convert_onehot_to_int(s_tp1), reward))

    trajectory_length += 1
    if done or trajectory_length >= max_trajectory_length:
      s_t = _start_episode()
      # Reset the per-episode step counter. Without this, once the cap was
      # hit for the first time every subsequent step restarted the episode.
      trajectory_length = 0
    else:
      s_t = s_tp1

  return trajectory, human_readable
def learn_model_from_data(
    mdp, mdp_wall_locs,
    num_trajectories=50, 
    max_trajectory_length=10,
    random_starts=False,
    affordances=None,
    policy=None,
    seed=None,
    trajectories=None):
  """Learns a count-based transition model from sampled data.

  For every (s, a) pair that was observed (and, when `affordances` is given,
  whose affordance entry is non-zero) the estimate is the empirical frequency
  COUNT(s, a, s') / COUNT(s, a). Every remaining pair whose estimate is still
  all-zero is filled from the wall-aware uniform prior returned by
  `get_uniform_phat`.

  Args:
    mdp: The MDP to sample from.
    mdp_wall_locs: The wall locations in that mdp.
    num_trajectories: Forwarded to `get_trajectories_transitions` when data
      has to be collected.
    max_trajectory_length: The maximum length of a trajectory; forwarded to
      `get_trajectories_transitions`.
    random_starts: rollouts are performed with random starting state.
    affordances: Optional array of shape |S| x |A|.
    policy: The policy to sample with. If None a random policy is used by the
      sampler. Only used when `trajectories` is None.
    seed: If not None, seeds `np.random` and `random` before sampling.
    trajectories: Optional previously collected iterable of
      `(s, a, s', r)` tuples with integer states. When given, no new data is
      sampled.

  Returns:
    The transition model P_hat of shape |S| x |A| x |S|.
  """
  if seed is not None:
    np.random.seed(seed)
    random.seed(seed)

  n_states, n_actions = mdp.state_space, mdp.action_space
  Mhat = np.zeros((n_states, n_actions, n_states))  # raw transition counts
  Phat = np.zeros((n_states, n_actions, n_states))  # estimated dynamics

  if trajectories is None:
    # `policy` used to be accepted but silently dropped; forward it so a
    # caller-supplied behaviour policy is actually used for data collection.
    trajectories, _ = get_trajectories_transitions(
        mdp=mdp, num_trajectories=num_trajectories,
        max_trajectory_length=max_trajectory_length,
        policy=policy,
        random_starts=random_starts)

  for (s, a, s_dash, _) in trajectories:
    Mhat[s, a, s_dash] += 1

  # Use get_uniform_phat instead of a naive initialization to take into
  # account the wall locations and the fact that we cannot transition into
  # walls.
  Phat_init_uniform = np.asarray(get_uniform_phat(mdp, mdp_wall_locs))

  # Normalise each observed (s, a) row once instead of once per sample.
  visit_counts = Mhat.sum(axis=2)
  estimable = visit_counts > 0
  if affordances is not None:
    estimable &= np.asarray(affordances) != 0
  Phat[estimable] = Mhat[estimable] / visit_counts[estimable][:, None]

  # Copy over the "initialized uniform state transition" for every (s, a)
  # whose estimate is still all-zero, i.e. pairs never seen in the data or
  # masked out above.
  # NOTE(review): the `>= 0.0` test is true for every non-negative affordance
  # value, so masked-out pairs also receive the uniform prior. This is the
  # original behaviour and is preserved -- confirm it is intended.
  unseen = Phat.sum(axis=2) == 0
  if affordances is not None:
    unseen &= np.asarray(affordances) >= 0.0
  Phat[unseen] = Phat_init_uniform[unseen]

  return Phat
#    Hypothesis: the planning value loss shrinks with more and more data
#    for low data regime, there would be an intermediate value of |AF_I| which
#    yields the optimal planning value loss.
#------------------------------------------------------------------------#

n_trajectories = [250, 400, 500, 750, 2000, 10000]
thresholds = [0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0]

# NOTE(review): p_success is not passed to anything in this cell; the
# dynamics come from get_stochastic_states_P below. Confirm the slider is
# meant to be a no-op.
p_success = 0.70 #@param {type:"slider", min:0, max:1, step:0.1}
max_iterations = 10000 #@param {type:"integer"}
max_trajectory_length = 10#@param {type:"integer"}
seed = 10000 #@param {type:"integer"}
nruns = 10

# Create an mdp M.
# This must happen BEFORE any buffer is sized from mdp.state_space: the cell
# previously allocated the result array from whatever `mdp` an earlier cell
# had left in the kernel, which only worked if that stale mdp happened to
# have the same number of states.
mdp, mdp_wall_locs = build_pachinko_gridsize(grid_size=19)

mdp.P = get_stochastic_states_P(mdp, mdp_wall_locs)

# Value of pi*_{M_hat_I} evaluated in M, per (run, #samples, threshold, state).
v_pi_star_Mhat_I_M_thresh_nsamples = np.zeros(
    (nruns, len(n_trajectories), len(thresholds), mdp.state_space))

# Number of non-zero affordance entries per threshold.
AF_sizes = np.zeros((len(thresholds)))

for n_traj_idx, n_traj_val in enumerate(n_trajectories):
  for ind, k in enumerate(thresholds):
    for run_id in range(nruns):
      # ------------Step 1: Compute Affordances ------------
      # Compute Affordances AF based on intent I
      AF = _compute_affordances(mdp=mdp,
                                n_states=mdp.state_space,
                                n_actions=mdp.action_space,
                                intent_name="collection",
                                threshold=k,
                                mdp_wall_locs=mdp_wall_locs)

      # ------------Step 2: Learn a Model for AF ------------
      # Learn P_hat_I from data: M_hat_I - Model with AF
      Phat_I = learn_model_from_data(
          mdp, mdp_wall_locs,
          num_trajectories=n_traj_val,
          max_trajectory_length=max_trajectory_length,
          random_starts=True,
          affordances=AF,
          policy=None,
          seed=seed+run_id)

      # ------------Step 3: Value Iteration------------
      # Value iteration - pi*M_hat_I
      (policy_star_Mhat_I, V_star_Mhat_I, _,
       V_star_Mhat_I_seconds, V_star_Mhat_I_iters) = value_iteration(
           mdp.R, Phat_I, max_iteration=max_iterations, seed=seed+run_id,
           AF=AF, mdp_wall_locs=mdp_wall_locs)

      # ------------Step 4: Policy Evaluation------------
      # pi*M_hat_I evaluated in M
      v_pi_star_Mhat_I_M_thresh_nsamples[run_id, n_traj_idx, ind, :] = (
          _policy_evaluation_exact(pi=policy_star_Mhat_I,
                                   r=mdp.R,
                                   p=mdp.P,
                                   gamma=mdp.gamma))

    AF_sizes[ind] = np.count_nonzero(AF)

# Reference solution: optimal policy of the true MDP M.
(policy_star_M, V_star_M, _,
 V_star_M_seconds, V_star_M_iters) = value_iteration(
     mdp.R,
     mdp.P,
     max_iteration=max_iterations,
     seed=seed)
# pi*M evaluated in M
v_pi_star_M_M = _policy_evaluation_exact(pi=policy_star_M,
                                         r=mdp.R,
                                         p=mdp.P,
                                         gamma=mdp.gamma)

# Mean L2 value loss per (#samples, threshold), with its spread over runs.
# "*_CI" holds std / sqrt(nruns), i.e. the standard error of the mean.
value_loss_to_plot = np.zeros((len(n_trajectories), len(thresholds)))
value_loss_to_plot_std = np.zeros((len(n_trajectories), len(thresholds)))
value_loss_to_plot_CI = np.zeros((len(n_trajectories), len(thresholds)))
for n_id in range(len(n_trajectories)):
  for _thresh in range(len(thresholds)):
    value_loss_raw_values = []
    for run_id in range(nruns):
      sliced_result = v_pi_star_Mhat_I_M_thresh_nsamples[run_id, n_id, _thresh, :]
      value_loss_raw_values.append(np.linalg.norm(
          abs(v_pi_star_M_M - sliced_result)))
    value_loss_to_plot[n_id, _thresh] = np.mean(value_loss_raw_values)
    value_loss_to_plot_std[n_id, _thresh] = np.std(value_loss_raw_values)
    value_loss_to_plot_CI[n_id, _thresh] = value_loss_to_plot_std[n_id, _thresh]/np.sqrt(nruns)
#@title Plotting Planning Value Loss Evaluation

# One curve per sample budget: mean L2 value loss against the affordance
# threshold, with a shaded band of +/- value_loss_to_plot_CI around it.
fig, ax1 = plt.subplots()
x_axis = thresholds
for i in range(value_loss_to_plot.shape[0]):
  # The legend reports the sample budget divided by the per-episode cap.
  curve_label = 'n={}'.format(int(n_trajectories[i]/max_trajectory_length))
  ax1.plot(x_axis, value_loss_to_plot[i, :], color=colors[i],
           label=curve_label,
           linewidth=3.00, marker=markers[i], markersize=10)
  ax1.fill_between(x_axis,
                   value_loss_to_plot[i, :]-value_loss_to_plot_CI[i, :],
                   value_loss_to_plot[i, :]+value_loss_to_plot_CI[i, :],
                   facecolor=colors[i], edgecolor=colors[i], alpha=0.25)

ax1.set_xticks([0, 0.25, 0.5, 0.75, 1.0])
ax1.legend(loc='upper center', bbox_to_anchor=(0.5, -0.19),
           fancybox=True, shadow=True, ncol=7,
           facecolor='w', fontsize=10)
ax1.set_xlabel("Threshold ($k$)", fontsize=16)
# Raw string: the LaTeX backslashes (\pi, \hat, \mathcal, \cal) are not valid
# Python escapes, so a plain string literal triggers invalid-escape warnings.
# The resulting text is unchanged.
ax1.set_ylabel(
    r"$||V^*_M - V^{\pi^{*}_{\hat{M}_{{{\mathcal{A} \mathcal{F}}_{\cal I}}}}}_M ||_2$",
    fontsize=18)
ax1.set_title("Model Learning: Value Loss Analysis")
# Affects figures created after this call, not the one drawn above.
matplotlib.rc('axes', edgecolor='black')
plt.show()
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "t3lzaMwFLi4D", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Helper function to plot affordance arrows.\n", + "\n", + "def plot_affordances_camera_ready(\n", + " nstates, nactions, affordances, mdp, mdp_wall_locs, ax=None,\n", + " headwidth=2.7, linewidths=1, scale=1.9, headlength=2,\n", + " wall_color=(0, 0, 0, 1), grid_kwargs=None, figsize=(14, 8)):\n", + " \"\"\"\n", + " Plots the environment with walls.\n", + " Args:\n", + " nstates: Number of state in the mdp\n", + " nactions: Number of actions in the mdp\n", + " affordances: Affordances of shape |S| * |A|\n", + " mdp: The mdp to plot\n", + " mdp_wall_locs:Locations of the walls for plotting them in a different color\n", + " ax: The axes to plot this on\n", + " headwidth: quiver arguments for arrows\n", + " linewidths: quiver arguments for arrows\n", + " scale: quiver arguments for arrows\n", + " headlength: quiver arguments for arrows\n", + " wall_color: RGB color of the walls\n", + " grid_kwargs: grid argrument specification.\n", + " figsize: Dimensions of the figure.\n", + "\n", + " Returns:\n", + " Visualization of the environment\n", + " \"\"\"\n", + " grid_kwargs = grid_kwargs or {}\n", + " if ax is None:\n", + " fig = plt.figure(figsize=figsize)\n", + " ax = fig.add_subplot(111)\n", + " plot_environment(\n", + " mdp, ax,\n", + " wall_locs=mdp_wall_locs,\n", + " plot_grid=True,\n", + " grid_kwargs=grid_kwargs,\n", + " wall_color=wall_color)\n", + " action_symbols = []\n", + " for s in range(nstates):\n", + " for a in range(nactions):\n", + " one_hot_state = convert_int_rep_to_onehot(s, nstates)\n", + " y_pos, x_pos = mdp.unflatten_state(one_hot_state)\n", + " if (y_pos, x_pos) not in mdp_wall_locs:\n", + " if affordances[s, a] == 1.0:\n", + " left_arrow = (-0.8, 0)\n", + " right_arrow = (0.8, 0)\n", + " up_arrow = (0, -0.8)\n", + " down_arrow = (0, 
0.8)\n", + " if a == actions.LEFT: # Left\n", + " ax.quiver(\n", + " x_pos,y_pos,*left_arrow, color=DEFAULT_ARROW_COLOR, alpha=1.0,\n", + " angles='xy', scale_units='xy', scale=scale,\n", + " headwidth=headwidth, linewidths=linewidths,\n", + " headlength=headlength) #L\n", + " if a == actions.RIGHT: #Right\n", + " ax.quiver(\n", + " x_pos,y_pos,*right_arrow, color=DEFAULT_ARROW_COLOR, alpha=1.0,\n", + " angles='xy', scale_units='xy', scale=scale,\n", + " headwidth=headwidth, linewidths=linewidths,\n", + " headlength=headlength) #R\n", + " if a == actions.UP: #Up\n", + " ax.quiver(\n", + " x_pos,y_pos,*up_arrow, color=DEFAULT_ARROW_COLOR, alpha=1.0,\n", + " angles='xy', scale_units='xy', scale=scale,\n", + " headwidth=headwidth, linewidths=linewidths,\n", + " headlength=headlength) #U\n", + " if a == actions.DOWN: #Down\n", + " ax.quiver(\n", + " x_pos,y_pos,*down_arrow,color=DEFAULT_ARROW_COLOR, alpha=1.0,\n", + " angles='xy', scale_units='xy', scale=scale,\n", + " headwidth=headwidth, linewidths=linewidths,\n", + " headlength=headlength) #D\n", + "\n", + " return ax\n" + ], + "execution_count": 40, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "0bM1DormLHHr", + "colab_type": "code", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 390 + }, + "outputId": "7e2c289a-03bc-4602-e066-373e317d2d8d" + }, + "source": [ + "#@title Plot affordances computed from transition matrix.\n", + "intent_name = 'collection' #@param [\"collection\", \"up\", \"left\"]\n", + "world_name = 'pachinko' #@param [\"pachinko\", \"one_room\", \"four_room\"]\n", + "grid_size = 7#@param {type:'integer'}\n", + "headwidth = 5#@param {type:'number'}\n", + "\n", + "if world_name == 'one_room':\n", + " mdp, mdp_wall_locs = build_one_room_gridsize(grid_size=grid_size)\n", + "elif world_name == 'pachinko':\n", + " mdp, mdp_wall_locs = build_pachinko_gridsize(grid_size=grid_size)\n", + "elif world_name == 'four_room':\n", + " mdp, 
mdp_wall_locs = build_four_rooms_example()\n", + "else:\n", + " raise ValueError('Unknown environment!')\n", + "\n", + "AF = _compute_affordances(\n", + " mdp,\n", + " mdp.state_space,\n", + " mdp.action_space,\n", + " intent_name=intent_name,\n", + " threshold=0.0,\n", + " mdp_wall_locs=mdp_wall_locs)\n", + "\n", + "if intent_name == 'up':\n", + " AF[mdp.terminal_states[0], :] = 0\n", + " AF[mdp.terminal_states[0], actions.LEFT] = 1\n", + "\n", + "plot_affordances_camera_ready(\n", + " mdp.state_space, \n", + " mdp.action_space, \n", + " affordances=AF, mdp=mdp,\n", + " mdp_wall_locs=mdp_wall_locs,\n", + " linewidths=0,\n", + " headwidth=headwidth,\n", + " headlength=4,\n", + " wall_color=(220, 220, 220, 0.5),\n", + " figsize=(5,5),\n", + " grid_kwargs={'color':(220 / 255, 220 / 255, 220 / 255, 0.8)}\n", + " )\n", + "plt.axis('off')\n", + "plt.tight_layout()" + ], + "execution_count": 43, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).\n" + ], + "name": "stderr" + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HbXIGuRCX35O", + "colab_type": "text" + }, + "source": [ + "# Sec 7.1 Learn Affordances" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "K3p1iqV8waWh", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Helper function to collect data from environment.\n", + "class Intent(enum.IntEnum):\n", + " completed = 1.0\n", + " incomplete = 0.0\n", + "\n", + "def get_trajectories(\n", + " mdp, num_rollouts=500, max_trajectory_length=50, policy=None,\n", + " intent_name='collection', random_starts=False, seed=None):\n", + " \"\"\"Takes trajectory samples from an environment.\n", + "\n", + " Args:\n", + " mdp: The MDP to evaluate the intent on.\n", + " num_rollouts: The total number of trajectories to sample.\n", + " max_trajectory_length: The maximum length of the trajectory.\n", + " policy: The policy to sample using. If none is given a random policy\n", + " is used. The policy must take a single argument, the one hot\n", + " representation of the state. 
If using a tensorflow function make sure to\n", + " handle batching within the policy itself.\n", + " intent_name: The name of the intent to evalaute.\n", + " random_starts: randomly sampled start state per rollout\n", + " seed: set a specific seed\n", + "\n", + " Returns:\n", + " The trajectories collected from the environment:\n", + " This is a 4-tuple containing the batch of state, action, state' and intent\n", + " target.\n", + " Human Readable transitions:\n", + " A set containing the unique transitions in the trajectory batch and if the\n", + " intent was completed.\n", + " \"\"\"\n", + " if seed is not None:\n", + " np.random.seed(seed)\n", + " random.seed(seed)\n", + " mdp.set_seed(seed)\n", + "\n", + " trajectory = []\n", + " if random_starts:\n", + " s_t = get_randomized_state(mdp)\n", + " else:\n", + " s_t = mdp.reset()\n", + " trajectory_length = 0\n", + " human_readable = set()\n", + " if policy is None:\n", + " def policy(_):\n", + " return np.random.randint(mdp.action_space)\n", + "\n", + " for _ in range(num_rollouts):\n", + " action = policy(s_t)\n", + " s_tp1, reward, done, _ = mdp.step(action)\n", + " state_int = get_current_state_integer(s_t)\n", + " intent = _get_intent_completed(\n", + " mdp, s_t, action, s_tp1, intent_name=intent_name)\n", + "\n", + " # Human readable vesion:\n", + " human_readable.add((\n", + " mdp.unflatten_state(s_t),\n", + " Actions(action),\n", + " mdp.unflatten_state(s_tp1),\n", + " Intent(intent)))\n", + "\n", + " # Prepare things for tensorflow:\n", + " s_tf = tf.constant(s_t.astype(np.float32))\n", + " s_tp1_tf = tf.constant(s_tp1.astype(np.float32))\n", + " a_tf = tf.one_hot(action, mdp.action_space)\n", + " # The mask will only consider the action that was actually taken.\n", + " mask_tf = tf.cast(a_tf, tf.float32)\n", + " # Computing targets\n", + " if intent > 0.0:\n", + " # if a completed the intent I_a(s')\n", + " targets = tf.one_hot(action, mdp.action_space)\n", + " else:\n", + " # vector of zeros if action a 
completed no intent.\n", + " targets = tf.zeros_like(a_tf)\n", + "\n", + " trajectory.append((\n", + " s_tf, a_tf, s_tp1_tf, targets)\n", + " )\n", + " trajectory_length += 1\n", + " if done or trajectory_length > max_trajectory_length:\n", + " if random_starts:\n", + " s_t = get_randomized_state(mdp)\n", + " else:\n", + " s_t = mdp.reset()\n", + " else:\n", + " s_t = s_tp1\n", + "\n", + " trajectory = list(map(tf.stack, zip(*trajectory)))\n", + " \n", + " return trajectory, human_readable" + ], + "execution_count": 44, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "4GMq1t77eigL", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Function: Populate Learned Affordances\n", + "def get_learned_affordances(mdp, affordnet, classification_threshold=0.55):\n", + " '''Gets the learned affordances from the environment.\n", + " Args:\n", + " mdp: The mdp to get the affordances from.\n", + " affordnet: The affordance network.\n", + " classification_threshold: accuracy of the classifier\n", + "\n", + " Returns:\n", + " affordances in the form of a |S| * |A|\n", + " '''\n", + " n_states, n_actions = mdp.state_space, mdp.action_space\n", + " affordances = np.zeros((n_states, n_actions))\n", + " for s in range(n_states):\n", + " action_prob_predictions = affordnet(tf.eye(n_states))[s]\n", + " actions_affordable = tf.where(\n", + " tf.greater_equal(\n", + " action_prob_predictions, tf.constant(classification_threshold)))[:,-1]\n", + " for ind, a in enumerate(actions_affordable.numpy()):\n", + " affordances[s,a] = 1.0\n", + " return affordances" + ], + "execution_count": 45, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "t789yH3IkGjR", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title Function: Affordance training code.\n", + "def train_afford_net(\n", + " mdp, network, optimizer,\n", + " intent_name='collection',\n", + " num_train_steps=10,\n", + " 
fresh_data=True,\n", + " num_rollouts=1,\n", + " max_trajectory_length=100,\n", + " optimize_performance=False,\n", + " debug=False,\n", + " print_losses=False,\n", + " random_starts=False,\n", + " passing_data=False,\n", + " trajectories=None,\n", + " unique_transitions=None):\n", + " \"\"\"Trains an affordance network.\n", + "\n", + " Args:\n", + " mdp: The mdp to collect training data from.\n", + " network: The affordance network.\n", + " optimizer: The optimizer to use for training.\n", + " intent_name: The name of the intent to train affordances for.\n", + " num_train_steps: The total number of training steps.\n", + " fresh_data: Use fresh data at every before completing a training step.\n", + " num_rollouts: The number of rollout trajectories per training step.\n", + " max_trajectory_length: The maximum length of each trajectory\n", + " optimizer_performance: Use tf.function to speed up training. (Right now\n", + " there are no apparent speed benefits of this function...?)\n", + " debug: Debug mode prints out the human readable transitions.\n", + " print_losses: Prints out the losses at every training step.\n", + " random_starts: randomly sampled start state per rollout\n", + " passing_data: set True if previously collected data is being passed\n", + " trajectories: tf trajectories from previously collected data\n", + " unique_transitions: tf unique_transitions from previously collected data\n", + " \"\"\" \n", + "\n", + " def _train_step(trajectory):\n", + " with tf.GradientTape() as tape:\n", + " s_t, a_t, s_tp1, intent_target = trajectory\n", + " preds = network(s_t)\n", + " mask = tf.cast(a_t, tf.float32)\n", + "\n", + " intent_target = tf.reshape(intent_target, (-1, 1))\n", + " preds = tf.reshape(preds, (-1, 1))\n", + " mask = tf.reshape(mask, (-1,))\n", + "\n", + " loss = tf.keras.losses.binary_crossentropy(intent_target, preds)\n", + " masked_loss = (loss * mask)\n", + "\n", + " total_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(1-mask)\n", + " 
grads = tape.gradient(total_loss, network.trainable_variables)\n", + " optimizer.apply_gradients(zip(grads, network.trainable_variables))\n", + "\n", + " return total_loss\n", + "\n", + " if optimize_performance and not debug:\n", + " print('Training step has been optimized.')\n", + " _train_step = tf.function(_train_step)\n", + "\n", + " initial_data_collected = False\n", + " for i in range(num_train_steps):\n", + " if not initial_data_collected or fresh_data:\n", + " initial_data_collected = True\n", + " running_time = time.time()\n", + " if not passing_data:\n", + " trajectories, unique_transitions = get_trajectories(\n", + " mdp, num_rollouts=num_rollouts,\n", + " max_trajectory_length=max_trajectory_length,\n", + " intent_name=intent_name,\n", + " random_starts=random_starts)\n", + " else:\n", + " trajectories = trajectories\n", + " unique_transitions = unique_transitions\n", + " collection_running_time = time.time() - running_time\n", + " if debug: print('unique_transitions:', unique_transitions)\n", + " running_time = time.time()\n", + " loss = _train_step(trajectories)\n", + " if debug or print_losses: \n", + " print(\n", + " 'loss:', loss.numpy().item(),\n", + " 'collection_loop_time', collection_running_time,\n", + " 'train_loop_time', time.time() - running_time)\n" + ], + "execution_count": 46, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "bNMt4bEJKFJz", + "colab_type": "code", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 407 + }, + "outputId": "873aedbb-2205-4261-e513-b64a69753349" + }, + "source": [ + "intent_name = 'collection' #@param {type:'string'}\n", + "world_name = 'pachinko' #@param {type:'string'}\n", + "grid_size = 9#@param {type:'integer'}\n", + "headwidth = 5#@param {type:'number'}\n", + "classification_threshold = 0.95#@param {type:'number'}\n", + "num_transitions = 2000#@param {type:'number'}\n", + "max_trajectory_length = 10#@param {type:'number'}\n", + "\n", + "if 
world_name == 'one_room':\n", + " mdp, mdp_wall_locs = build_one_room_gridsize(grid_size=grid_size)\n", + "elif world_name == 'pachinko':\n", + " mdp, mdp_wall_locs = build_pachinko_gridsize(grid_size=grid_size)\n", + "elif world_name == 'four_room':\n", + " mdp, mdp_wall_locs = build_four_rooms_example()\n", + "\n", + "\n", + "network = tf.keras.layers.Dense(\n", + " mdp.action_space,\n", + " activation=tf.keras.activations.sigmoid)\n", + "\n", + "sgd = tf.keras.optimizers.Adam(learning_rate=0.1)\n", + "\n", + "train_afford_net(\n", + " mdp, \n", + " network,\n", + " sgd,\n", + " intent_name=intent_name,\n", + " num_train_steps=5000,\n", + " num_rollouts=num_transitions,\n", + " fresh_data=False,\n", + " max_trajectory_length=max_trajectory_length,\n", + " debug=False,\n", + " print_losses=False,\n", + " optimize_performance=True,\n", + " random_starts=True)\n", + "\n", + "AF_learned = get_learned_affordances(\n", + " mdp,\n", + " affordnet=network, \n", + " classification_threshold=0.95)\n", + "\n", + "plot_affordances_camera_ready(\n", + " mdp.state_space, \n", + " mdp.action_space, \n", + " affordances=AF_learned, mdp=mdp,\n", + " mdp_wall_locs=mdp_wall_locs,\n", + " linewidths=0,\n", + " headwidth=headwidth,\n", + " headlength=4,\n", + " wall_color=(220,220,220,0.5),\n", + " figsize=(5,5),\n", + " grid_kwargs={'color':(220/255,220/255,220/255,0.8)}\n", + " )\n", + "plt.axis('off')\n", + "plt.tight_layout()" + ], + "execution_count": 47, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Training step has been optimized.\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).\n" + ], + "name": "stderr" + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + } + ] +} \ No newline at end of file diff --git a/affordances_theory/README.md b/affordances_theory/README.md index 77eb8fd..84f105c 100644 --- a/affordances_theory/README.md +++ b/affordances_theory/README.md @@ -1,5 +1,8 @@ # Code for "What can I do here? A theory of affordances in reinforcement Learning. -This iPython notebook accompanies the paper "What can I do here? A theory of -affordances in reinforcmenet learning" and covers the experiments in Section 8. +This repository accompanies code for the paper "What can I do here? A theory of +affordances in reinforcmenet learning". +The iPython notebook AffordancesInDiscreteEnvironment.ipynb covers the experiments in Section 6 and Learning affordances in discrete environments. + +The iPython notebook AffordancesInContinuousEnvironment.ipynb covers the experiments in Section 7.