Open sourcing the physics inspired models code.

PiperOrigin-RevId: 408640606
2026-06-01 21:56:38 +08:00 · 2021-10-27 00:57:04 +01:00
parent 9b751b7d20
commit 2c7c401024
20 changed files with 5902 additions and 0 deletions
@@ -85,6 +85,7 @@ https://deepmind.com/research/publications/
 *   [REGAL: Transfer Learning for Fast Optimization of Computation Graphs](regal)
 *   [Deep Ensembles: A Loss Landscape Perspective](ensemble_loss_landscape)
 *   [Powerpropagation](powerpropagation)
 *   [Physics Inspired Models](physics_inspired_models)
@@ -0,0 +1,59 @@
 # Implementation of multiple physics inspired models for modelling dynamics
 This repository contains an implementation of different physics inspired models
 used in the papers: **SyMetric: Measuring the Quality of Learnt Hamiltonian
 Dynamics Inferred from Vision** and **Which priors matter? Benchmarking models
 for learning latent dynamics**.
 ## Contributing
 This is purely research code, provided with no further intentions of support or
 any guarantees of backward compatibility.
 ## Installation
 All package requirements are listed in `requirements.txt`.
 You will still need to download and set up the datasets from the
 [DeepMind Hamiltonian Dynamics Suite] manually.
 ```shell
 git clone git@github.com:deepmind/deepmind-research.git
 pip install -r ./deepmind-research/physics_inspired_models/requirements.txt
 pip install ./deepmind-research/physics_inspired_models
 pip install --upgrade "jax[XXX]"
 ```
 where `XXX` is the correct type of accelerator that you have on your machine.
 Note that if you are using a GPU you might need `XXX` to also include the
 correct version of CUDA and cuDNN installed on your machine.
 For more details please read [here](https://github.com/google/jax#installation).
 ## Usage
 The file `jaxline_configs.py` contains all the configuration specifications for
 the experiments in the two papers. To run an experiment, in addition to passing
 the location of the configs file, you must provide three extra comma-separated
 arguments in the following format:
 `${name_of_configuration},${index_in_sweep},${dataset_name}`
 For example, to run the second hyper-parameter configuration (sweep index 1) of
 the improved Hamiltonian Generative Network (HGN++) on the mass-spring dataset
 you should run in the command line (assuming that you are in the folder of the
 project):
 ```shell
 python3 jaxline_train.py \
  --config="jaxline_configs.py:sym_metric_hgn_plus_plus_sweep,1,toy_physics/mass_spring" \
  --jaxline_mode="train" \
  --logtostderr
 ```
 ## Reference
 **SyMetric: Measuring the Quality of Learnt Hamiltonian Dynamics Inferred from Vision**
 **Which priors matter? Benchmarking models for learning latent dynamics**
 [DeepMind Hamiltonian Dynamics Suite]: https://github.com/deepmind/dm_hamiltonian_dynamics_suite
@@ -0,0 +1,14 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
@@ -0,0 +1,353 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module containing model evaluation metric."""
 import _thread as thread
 import sys
 import threading
 import time
 import warnings
 from absl import logging
 import distrax
 import numpy as np
 from sklearn import linear_model
 from sklearn import model_selection
 from sklearn import preprocessing
 def quit_function(fn_name):
  """Logs that `fn_name` timed out and interrupts the main thread.

  Intended to run on the timer thread started by `exit_after`. The call to
  `thread.interrupt_main()` makes the main thread raise `KeyboardInterrupt`.

  Args:
    fn_name: name of the function that exceeded its time budget; only used in
      the log message.
  """
  logging.error('%s took too long', fn_name)
  # Make sure the message is visible before the main thread is interrupted.
  sys.stderr.flush()
  thread.interrupt_main()


 def exit_after(s):
  """Use as decorator to exit function after s seconds.

  While the decorated function runs, a `threading.Timer` is armed. If it fires
  before the function returns, `quit_function` interrupts the main thread, so
  the decorated call raises `KeyboardInterrupt`. The timer is always cancelled
  when the call finishes, whether it returned or raised.

  Args:
    s: time budget in seconds.

  Returns:
    A decorator that wraps a function with the timeout and preserves its
    metadata (`__name__`, `__doc__`, ...).
  """
  # Local import keeps this block self-contained.
  import functools  # pylint: disable=g-import-not-at-top

  def outer(fn):
    @functools.wraps(fn)
    def inner(*args, **kwargs):
      timer = threading.Timer(s, quit_function, args=[fn.__name__])
      timer.start()
      try:
        return fn(*args, **kwargs)
      finally:
        # Never leave a pending timer behind, even if `fn` raised.
        timer.cancel()
    return inner
  return outer
@exit_after(400)
def do_grid_search(data_x_exp, data_y, clf, parameters, cv):
  """Runs a cross-validated grid search for `clf` over `parameters`.

  Candidates are scored by explained variance and the best one is refit on
  all of the data. The `exit_after(400)` decorator interrupts the main thread
  if the search takes longer than 400 seconds.

  Args:
    data_x_exp: regression inputs passed to `fit`.
    data_y: regression targets passed to `fit`.
    clf: estimator to tune.
    parameters: parameter grid handed to `GridSearchCV`.
    cv: cross-validation setting handed to `GridSearchCV`.

  Returns:
    The fitted `GridSearchCV` object.
  """
  grid_search = model_selection.GridSearchCV(
      clf,
      parameters,
      cv=cv,
      refit=True,
      scoring='explained_variance')
  grid_search.fit(data_x_exp, data_y)
  return grid_search
 def symplectic_matrix(dim):
  """Builds the anti-symmetric block matrix [[0, -I], [I, 0]].

  Args:
    dim: total dimensionality; each block has size `int(dim / 2)`, so an odd
      `dim` yields a matrix of size `dim - 1`.

  Returns:
    A square numpy array with `2 * int(dim / 2)` rows.
  """
  block_size = int(dim / 2)
  identity = np.eye(block_size)
  zero_block = np.zeros([block_size, block_size])
  return np.block([[zero_block, -identity], [identity, zero_block]])
 def create_latent_mask(z0, dist_std_threshold=0.5):
  """Create mask based on informativeness of each latent dimension.

  For stochastic models those latent dimensions that are too close to the prior
  are likely to be uninformative and can be ignored.

  The latent vector is treated as two halves (named q and p in the code). A
  pair (q_i, p_i) is kept when either member falls below the threshold, and the
  resulting half-mask is duplicated so q and p share the same selection.

  NOTE(review): the statistic that is thresholded is the mean of
  `z0.variance()` over axis 0, although the argument name and the local
  variable names refer to standard deviations - confirm which is intended.

  Args:
    z0: distribution or array of phase space
    dist_std_threshold: informative latents have average inferred stds <
      dist_std_threshold

  Returns:
    latent_mask_final: boolean mask of the same dimensionality as z0

  Raises:
    NotImplementedError: if z0 is a distrax distribution other than Normal.
  """
  if isinstance(z0, distrax.Normal):
    # Average over axis 0 (presumably the batch axis - TODO confirm).
    std_vals = np.mean(z0.variance(), axis=0)
  elif isinstance(z0, distrax.Distribution):
    raise NotImplementedError()
  else:
    # If the latent is deterministic, pass through all dimensions
    return np.array([True]*z0.shape[-1])
  tensor_shape = std_vals.shape
  half_dims = int(tensor_shape[-1] / 2)
  # First half is treated as q, second half as p.
  std_vals_q = std_vals[:half_dims]
  std_vals_p = std_vals[half_dims:]
  # Keep both q and corresponding p as either one is informative
  informative_latents_inds = np.array([
      x for x in range(len(std_vals_q)) if
      std_vals_q[x] < dist_std_threshold or std_vals_p[x] < dist_std_threshold
  ])
  if informative_latents_inds.shape[0] > 0:
    latent_mask_final = np.zeros_like(std_vals_q)
    latent_mask_final[informative_latents_inds] = 1
    # Apply the same selection to the q half and the p half.
    latent_mask_final = np.concatenate([latent_mask_final, latent_mask_final])
    latent_mask_final = latent_mask_final == 1
    return latent_mask_final
  else:
    # No dimension passed the threshold: fall back to keeping everything.
    return np.array([True]*tensor_shape[-1])
 def standardize_data(data):
  """Standardizes `data` with a freshly fitted sklearn `StandardScaler`.

  Args:
    data: array-like that the scaler is both fitted on and applied to.

  Returns:
    The transformed data.
  """
  return preprocessing.StandardScaler().fit(data).transform(data)
 def find_best_polynomial(data_x, data_y, max_poly_order, rsq_threshold,
                         max_dim_n=32,
                         alpha_sweep=None,
                         max_iter=1000, cv=2):
  """Find minimal polynomial expansion that is sufficient to explain data using Lasso regression.

  Starting from order 1, the inputs are expanded into polynomial features, a
  Lasso regularisation strength is chosen by grid search, and a Lasso model
  with that strength is refit on all data. The order is increased until the
  score reaches `rsq_threshold` or `max_poly_order` is exceeded. If a grid
  search times out (surfacing as `KeyboardInterrupt`), the results of the last
  completed order are returned instead.

  Args:
    data_x: regression inputs.
    data_y: regression targets.
    max_poly_order: largest polynomial order to try.
    rsq_threshold: stop as soon as the fit score reaches this value.
    max_dim_n: if the last dimension of `data_x` exceeds this, only order 1 is
      tried to avoid a very large expansion.
    alpha_sweep: candidate Lasso `alpha` values; a default sweep is used when
      this is None (or contains no non-zero entry).
    max_iter: maximum number of Lasso iterations.
    cv: cross-validation setting for the grid search.

  Returns:
    Tuple `(clf, poly, data_x_exp, rsq)`: the fitted Lasso model, the
    polynomial feature transformer, the expanded inputs and the score of the
    selected order.

  Raises:
    KeyboardInterrupt: if the grid search for the very first polynomial order
      times out, since there is no earlier result to fall back on.
  """
  rsq = 0
  poly_order = 1
  # Results of the last order whose fit completed; restored when a later grid
  # search times out. None until the first order has finished.
  last_completed = None
  if not np.any(alpha_sweep):
    alpha_sweep = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
  # Avoid a large polynomial expansion for large latent sizes
  if data_x.shape[-1] > max_dim_n:
    print(f'>WARNING! Data is too high dimensional at {data_x.shape[-1]}')
    print('>WARNING! Setting max_poly_order = 1')
    max_poly_order = 1
  while rsq < rsq_threshold and poly_order <= max_poly_order:
    time_start = time.perf_counter()
    poly = preprocessing.PolynomialFeatures(poly_order, include_bias=False)
    data_x_exp = poly.fit_transform(data_x)
    time_end = time.perf_counter()
    print(
        f'Took {time_end-time_start}s to create polynomial features of order '
        f'{poly_order} and size {data_x_exp.shape[1]}.')
    with warnings.catch_warnings():
      warnings.simplefilter('ignore')
      time_start = time.perf_counter()
      # `normalize` is intentionally not passed: False was its default and the
      # keyword no longer exists in recent scikit-learn releases.
      clf = linear_model.Lasso(
          random_state=0, max_iter=max_iter, warm_start=False)
      parameters = {'alpha': alpha_sweep}
      try:
        regressor = do_grid_search(data_x_exp, data_y, clf, parameters, cv)
        time_end = time.perf_counter()
        print(f'Took {time_end-time_start}s to do regression grid search.')
        # Get rsq results
        time_start = time.perf_counter()
        clf = linear_model.Lasso(
            random_state=0,
            alpha=regressor.best_params_['alpha'],
            max_iter=max_iter,
            warm_start=False)
        clf.fit(data_x_exp, data_y)
        rsq = clf.score(data_x_exp, data_y)
        time_end = time.perf_counter()
        print(f'Took {time_end-time_start}s to get rsq results.')
        last_completed = (regressor, poly_order, poly, data_x_exp, rsq, clf)
        print(f'Polynomial of order {poly_order} with '
              f' alpha={regressor.best_params_} RSQ: {rsq}')
        poly_order += 1
      except KeyboardInterrupt:
        time_end = time.perf_counter()
        print(f'Timed out after {time_end-time_start}s of doing grid search.')
        if last_completed is None:
          # The first order already timed out, so there is nothing to roll
          # back to. Propagate the interrupt instead of failing on unset
          # variables.
          raise
        regressor, poly_order, poly, data_x_exp, rsq, clf = last_completed
        print(f'Continuing with previous poly_order={poly_order}...')
        print(f'Polynomial of order {poly_order} with '
              f' alpha={regressor.best_params_} RSQ: {rsq}')
        break
  return clf, poly, data_x_exp, rsq
 def eval_monomial_grad(feature, x, w, grad_acc):
  """Adds the gradient of one weighted monomial to `grad_acc` in place.

  Args:
    feature: monomial description such as `'x0^2 x1'`: space separated
      factors, each `x<index>` with an optional `^<power>`.
    x: point at which the gradient is evaluated, indexed by variable index.
    w: weight multiplying the monomial.
    grad_acc: array, indexed by variable index, that receives the partial
      derivatives.

  Returns:
    `grad_acc`, after the update.
  """
  factors = feature.split(' ')
  touched_vars = []
  # partials[k] ends up as w * d(monomial) / d(variable of factor k).
  partials = np.ones(len(factors)) * w
  for pos, factor in enumerate(factors):
    name, caret, exponent = factor.partition('^')
    power = int(exponent) if caret else 1
    var = int(name[1:])
    touched_vars.append(var)
    # Every partial derivative sees this factor unchanged ...
    scale = np.ones_like(partials) * (x[var] ** power)
    # ... except the derivative w.r.t. this factor's own variable. Power 1 is
    # special-cased so the result is exactly 1.0 even when x[var] is 0.0.
    if power == 1:
      scale[pos] = 1.0
    else:
      scale[pos] = power * (x[var] ** (power - 1))
    partials = partials * scale
  grad_acc[touched_vars] += partials
  return grad_acc
 def compute_jacobian_manual(x, polynomial_features, weight_matrix, tolerance):
  """Evaluates the Jacobian of a polynomial regression model at `x`.

  Args:
    x: point at which the Jacobian is evaluated.
    polynomial_features: monomial names, one per column of `weight_matrix`, in
      the format understood by `eval_monomial_grad`.
    weight_matrix: regression weights indexed as `[output, feature]`.
    tolerance: weights whose magnitude is not above this value are zeroed.

  Returns:
    Array with one row per output, each row holding the gradient of that
    output with respect to `x`.
  """
  # Zero out weights that are too small to matter.
  significant = np.abs(weight_matrix) > tolerance
  pruned_weights = significant * weight_matrix
  rows = []
  for out_idx in range(pruned_weights.shape[0]):
    row_grad = np.zeros_like(x)
    for feat_idx, monomial in enumerate(polynomial_features):
      # Accumulates into `row_grad` in place.
      eval_monomial_grad(
          monomial, x, pruned_weights[out_idx, feat_idx], row_grad)
    rows.append(row_grad)
  return np.stack(rows)
 def calculate_jacobian_prod(jacobian, noise_eps=1e-6):
  """Calculates AA*, where A=JEJ^T and A*=JE^TJ^T, which should be I.

  Args:
    jacobian: Jacobian matrix J.
    noise_eps: constant added to every entry of J before the products.

  Returns:
    The matrix product of A and A*, with E given by `symplectic_matrix`.
  """
  # Add noise as 0 in jacobian creates issues in calculations later
  noisy = jacobian + noise_eps
  omega = symplectic_matrix(noisy.shape[1])
  noisy_t = np.transpose(noisy)
  a = np.matmul(np.matmul(noisy, omega), noisy_t)
  a_star = np.matmul(np.matmul(noisy, np.transpose(omega)), noisy_t)
  return np.matmul(a, a_star)
 def normalise_jacobian_prods(jacobian_preds):
  """Rescales a set of Jacobian products by one shared constant.

  The constant is the average, over all products, of each product's largest
  absolute entry. If that average is zero the stacked input is returned
  unscaled.

  Args:
    jacobian_preds: sequence of equally shaped 2-D arrays.

  Returns:
    A single array stacking the (rescaled) products along a new first axis.
  """
  stacked = np.stack(jacobian_preds)
  # Largest magnitude within each product, then averaged across products.
  peak_per_pred = np.abs(stacked).max(axis=(1, 2))
  scale = peak_per_pred.mean()
  if scale == 0:
    return stacked
  return stacked / scale
 def calculate_symetric_score(
    gt_data,
    model_data,
    max_poly_order,
    max_sym_score,
    rsq_threshold,
    sym_threshold,
    evaluation_point_n,
    trajectory_n=1,
    weight_tolerance=1e-5,
    alpha_sweep=None,
    max_iter=1000,
    cv=2):
  """Computes the SyMetric score of a learnt latent space.

  Finds the minimal polynomial expansion of `model_data` that explains
  `gt_data` using Lasso regression, evaluates the Jacobian of that mapping at
  randomly chosen points and measures how far the mapping is from being
  symplectic.

  Args:
    gt_data: regression targets (ground truth data).
    model_data: regression inputs (model data); its second to last axis is
      truncated to `gt_data.shape[0]` entries.
    max_poly_order: largest polynomial order to try.
    max_sym_score: upper bound applied to the symplecticity score.
    rsq_threshold: regression score required for a SyMetric of 1.
    sym_threshold: symplecticity score must be below this for a SyMetric of 1.
    evaluation_point_n: number of random points evaluated per trajectory.
    trajectory_n: number of equally sized consecutive chunks the data rows are
      split into.
    weight_tolerance: regression weights with magnitude at or below this are
      ignored when computing Jacobians.
    alpha_sweep: forwarded to `find_best_polynomial`.
    max_iter: forwarded to `find_best_polynomial`.
    cv: forwarded to `find_best_polynomial`.

  Returns:
    Tuple `(results, clf, poly, model_data_exp)`, where `results` is a dict
    with keys `poly_exp_order`, `rsq`, `sym` and `SyMetric`, and the remaining
    entries are the outputs of `find_best_polynomial`.
  """
  model_data = model_data[..., :gt_data.shape[0], :]
  # Find polynomial expansion that explains enough variance in the gt data
  print('Finding best polynomial expansion...')
  time_start = time.perf_counter()
  # Clean up model data to ensure it doesn't contain NaN, infinity
  # or values too large for dtype('float32')
  model_data = np.nan_to_num(model_data)
  model_data = np.clip(model_data, -999999, 999999)
  clf, poly, model_data_exp, best_rsq = find_best_polynomial(
      model_data, gt_data, max_poly_order, rsq_threshold,
      32, alpha_sweep, max_iter, cv)
  time_end = time.perf_counter()
  print(f'Took {time_end - time_start}s to find best polynomial.')
  # Calculate Symplecticity score
  all_raw_scores = []
  # `get_feature_names` no longer exists in recent scikit-learn releases,
  # which provide `get_feature_names_out` instead; support both.
  if hasattr(poly, 'get_feature_names_out'):
    features = np.array(poly.get_feature_names_out())
  else:
    features = np.array(poly.get_feature_names())
  points_per_trajectory = int(len(gt_data) / trajectory_n)
  for trajectory in range(trajectory_n):
    random_data_inds = np.random.permutation(
        range(points_per_trajectory))[:evaluation_point_n]
    jacobian_preds = []
    for point_ind in random_data_inds:
      input_data_point = model_data[points_per_trajectory * trajectory +
                                    point_ind]
      time_start = time.perf_counter()
      jacobian = compute_jacobian_manual(input_data_point, features,
                                         clf.coef_, weight_tolerance)
      pred = calculate_jacobian_prod(jacobian)
      jacobian_preds.append(pred)
      time_end = time.perf_counter()
      print(f'Took {time_end - time_start}s to evaluate jacobian '
            f'around point {point_ind}.')
    # Normalise
    normalised_jacobian_preds = normalise_jacobian_prods(jacobian_preds)
    # The score is measured as the deviation from I
    identity = np.eye(normalised_jacobian_preds.shape[-1])
    scores = np.mean(np.power(normalised_jacobian_preds - identity, 2),
                     axis=(1, 2))
    all_raw_scores.append(scores)
  # Average over all points and trajectories, capped at max_sym_score.
  sym_score = np.min([np.mean(all_raw_scores), max_sym_score])
  # Calculate final SyMetric score
  if best_rsq > rsq_threshold and sym_score < sym_threshold:
    sy_metric = 1.0
  else:
    sy_metric = 0.0
  results = {
      'poly_exp_order': poly.get_params()['degree'],
      'rsq': best_rsq,
      'sym': sym_score,
      'SyMetric': sy_metric,
  }
  with np.printoptions(precision=4, suppress=True):
    print(f'----------------FINAL RESULTS FOR {trajectory_n} '
          'TRAJECTORIES------------------')
    print(f'BEST POLYNOMIAL EXPANSION ORDER: {results["poly_exp_order"]}')
    print(f'BEST RSQ (1-best): {results["rsq"]}')
    print(f'SYMPLECTICITY SCORE AROUND ALL POINTS AND ALL '
          f'TRAJECTORIES (0-best): {sym_score}')
    print(f'SyMETRIC SCORE: {sy_metric}')
    print(f'----------------FINAL RESULTS FOR {trajectory_n} '
          f'TRAJECTORIES------------------')
  return results, clf, poly, model_data_exp
@@ -0,0 +1,397 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module containing all of the configurations for various models."""
 import copy
 import os
 from jaxline import base_config
 import ml_collections as collections
 _DATASETS_PATH_VAR_NAME = "DM_HAMILTONIAN_DYNAMICS_SUITE_DATASETS"
 def get_config(arg_string):
  """Return config object for training.

  Args:
    arg_string: comma separated string of the form
      `model_config_name,sweep_index,dataset_name`. `model_config_name` must
      be the name of a module-level function in this file that returns a
      `(model_config, sweeps)` pair, `sweep_index` selects one entry of
      `sweeps` and `dataset_name` is joined onto the datasets root folder.

  Returns:
    The jaxline base config populated with the experiment settings and with
    the selected sweep overrides applied.

  Raises:
    ValueError: if `arg_string` does not have exactly three parts, if the
      config name is not defined in this module, or if the datasets
      environment variable is unset or empty.
  """
  args = arg_string.split(",")
  if len(args) != 3:
    raise ValueError("You must provide exactly three arguments separated by a "
                     "comma - model_config_name,sweep_index,dataset_name.")
  model_config_name, sweep_index, dataset_name = args
  sweep_index = int(sweep_index)
  config = base_config.get_base_config()
  config.random_seed = 123109801
  config.eval_modes = ("eval", "eval_metric")
  # Get the model config and the sweeps
  # The name is resolved against this module's globals, so any module-level
  # name passes this check; only the `*_sweep` functions return the expected
  # `(model_config, sweeps)` pair.
  if model_config_name not in globals():
    raise ValueError(f"The config name {model_config_name} does not exist in "
                     f"jaxline_configs.py")
  config_and_sweep_fn = globals()[model_config_name]
  model_config, sweeps = config_and_sweep_fn()
  # The datasets root folder is taken from an environment variable.
  if not os.environ.get(_DATASETS_PATH_VAR_NAME, None):
    raise ValueError(f"You need to set the {_DATASETS_PATH_VAR_NAME}")
  dm_hamiltonian_suite_path = os.environ[_DATASETS_PATH_VAR_NAME]
  dataset_folder = os.path.join(dm_hamiltonian_suite_path, dataset_name)
  # Experiment config. Note that batch_size is per device.
  # In the experiments we run on 4 GPUs, so the effective batch size was 128.
  config.experiment_kwargs = collections.ConfigDict(
      dict(
          config=dict(
              dataset_folder=dataset_folder,
              model_kwargs=model_config,
              num_extrapolation_steps=60,
              drop_stats_containing=("neg_log_p_x", "l2_over_time", "neg_elbo"),
              optimizer=dict(
                  name="adam",
                  kwargs=dict(
                      learning_rate=1.5e-4,
                      b1=0.9,
                      b2=0.999,
                  )
              ),
              training=dict(
                  batch_size=32,
                  burnin_steps=5,
                  num_epochs=None,
                  lagging_vae=False
              ),
              evaluation=dict(
                  batch_size=64,
              ),
              evaluation_metric=dict(
                  batch_size=5,
                  batch_n=20,
                  num_eval_metric_steps=60,
                  max_poly_order=5,
                  max_jacobian_score=1000,
                  rsq_threshold=0.9,
                  sym_threshold=0.05,
                  evaluation_point_n=10,
                  weight_tolerance=1e-03,
                  max_iter=1000,
                  cv=2,
                  alpha_min_logspace=-4,
                  alpha_max_logspace=-0.5,
                  alpha_step_n=10,
                  calculate_fully_after_steps=40000,
              ),
              evaluation_metric_mlp=dict(
                  batch_size=64,
                  batch_n=10000,
                  datapoint_param_multiplier=1000,
                  num_eval_metric_steps=60,
                  evaluation_point_n=10,
                  evaluation_trajectory_n=50,
                  rsq_threshold=0.9,
                  sym_threshold=0.05,
                  ridge_lambda=0.01,
                  model=dict(
                      num_units=4,
                      num_layers=4,
                      activation="tanh",
                  ),
                  optimizer=dict(
                      name="adam",
                      kwargs=dict(
                          learning_rate=1.5e-3,
                      )
                  ),
              ),
              evaluation_vpt=dict(
                  batch_size=5,
                  batch_n=2,
                  vpt_threshold=0.025,
              )
          )
      )
  )
  # Training loop config.
  config.training_steps = int(500000)
  config.interval_type = "steps"
  config.log_tensors_interval = 50
  config.log_train_data_interval = 50
  config.log_all_train_data = False
  config.save_checkpoint_interval = 100
  config.checkpoint_dir = "/tmp/physics_inspired_models/"
  config.train_checkpoint_all_hosts = False
  config.eval_specific_checkpoint_dir = ""
  # Apply the overrides of the selected sweep entry; its keys are flattened
  # dotted paths into `config`.
  config.update_from_flattened_dict(sweeps[sweep_index])
  return config
 # Flattened-key prefixes used by the sweep dictionaries to address fields of
 # `config.experiment_kwargs.config` and of its `model_kwargs` entry.
 config_prefix = "experiment_kwargs.config."
 model_prefix = config_prefix + "model_kwargs."
 # Default keyword arguments for the encoder network.
 default_encoder_kwargs = collections.ConfigDict(dict(
    conv_channels=64,
    num_blocks=3,
    blocks_depth=2,
    activation="leaky_relu",
 ))
 # Default keyword arguments for the decoder network.
 default_decoder_kwargs = collections.ConfigDict(dict(
    conv_channels=64,
    num_blocks=3,
    blocks_depth=2,
    activation="leaky_relu",
 ))
 # Default keyword arguments for the network(s) of the latent system.
 default_latent_system_net_kwargs = collections.ConfigDict(dict(
    conv_channels=64,
    num_units=250,
    num_layers=5,
    activation="swish",
 ))
 # Default keyword arguments for the latent system. Typed placeholders are
 # fields without a value here; presumably they are filled in per model or per
 # sweep - confirm against the model code.
 default_latent_system_kwargs = collections.ConfigDict(dict(
    # Physics model arguments
    input_space=collections.config_dict.placeholder(str),
    simulation_space=collections.config_dict.placeholder(str),
    potential_func_form="separable_net",
    kinetic_func_form=collections.config_dict.placeholder(str),
    # NOTE(review): the `hgn_`/`lgn_` prefixed entries look like per-model
    # defaults for the unprefixed placeholders - confirm.
    hgn_kinetic_func_form="separable_net",
    lgn_kinetic_func_form="matrix_dep_quad",
    parametrize_mass_matrix=collections.config_dict.placeholder(bool),
    hgn_parametrize_mass_matrix=False,
    lgn_parametrize_mass_matrix=True,
    mass_eps=1.0,
    # ODE model arguments
    integrator_method=collections.config_dict.placeholder(str),
    # RGN model arguments
    residual=collections.config_dict.placeholder(bool),
    # General arguments
    net_kwargs=default_latent_system_net_kwargs
 ))
 # Base model configuration that most sweep functions deep-copy and customise.
 default_config_dict = collections.ConfigDict(dict(
    name=collections.config_dict.placeholder(str),
    latent_system_dim=32,
    latent_system_net_type="mlp",
    latent_system_kwargs=default_latent_system_kwargs,
    encoder_aggregation_type="linear_projection",
    decoder_de_aggregation_type=collections.config_dict.placeholder(str),
    encoder_kwargs=default_encoder_kwargs,
    decoder_kwargs=default_decoder_kwargs,
    has_latent_transform=False,
    num_inference_steps=5,
    num_target_steps=60,
    latent_training_type="forward",
    # Choices: overlap_by_one, no_overlap, include_inference
    training_data_split="overlap_by_one",
    objective_type="ELBO",
    elbo_beta_delay=0,
    elbo_beta_final=1.0,
    geco_kappa=0.001,
    geco_alpha=0.0,
    dt=0.125,
 ))
 # The `hgn_paper_*` objects below configure the model used by
 # `sym_metric_hgn_sweep`; going by the names they presumably mirror the setup
 # of the original HGN paper - confirm.
 hgn_paper_encoder_kwargs = collections.ConfigDict(dict(
    conv_channels=[[32, 64], [64, 64], [64]],
    num_blocks=3,
    blocks_depth=2,
    activation="relu",
    kernel_shapes=[2, 4],
    padding=["VALID", "SAME"],
 ))
 hgn_paper_decoder_kwargs = collections.ConfigDict(dict(
    conv_channels=64,
    num_blocks=3,
    blocks_depth=2,
    activation="tf_leaky_relu",
 ))
 hgn_paper_latent_net_kwargs = collections.ConfigDict(dict(
    conv_channels=[32, 64, 64, 64],
    num_units=250,
    num_layers=5,
    activation="softplus",
    kernel_shapes=[3, 2, 2, 2, 2],
    strides=[1, 2, 1, 2, 1],
    padding=["SAME", "VALID", "SAME", "VALID", "SAME"]
 ))
 hgn_paper_latent_system_kwargs = collections.ConfigDict(dict(
    potential_func_form="separable_net",
    kinetic_func_form="separable_net",
    parametrize_mass_matrix=False,
    net_kwargs=hgn_paper_latent_net_kwargs
 ))
 hgn_paper_latent_transform_kwargs = collections.ConfigDict(dict(
    num_layers=5,
    conv_channels=64,
    num_units=64,
    activation="relu",
 ))
 # Start from the defaults and override the fields that differ.
 hgn_paper_config = copy.deepcopy(default_config_dict)
 hgn_paper_config.training_data_split = "include_inference"
 hgn_paper_config.latent_system_net_type = "conv"
 # Both aggregation types are reset to empty string placeholders.
 hgn_paper_config.encoder_aggregation_type = (collections.config_dict.
                                             placeholder(str))
 hgn_paper_config.decoder_de_aggregation_type = (collections.config_dict.
                                                placeholder(str))
 hgn_paper_config.latent_system_kwargs = hgn_paper_latent_system_kwargs
 hgn_paper_config.encoder_kwargs = hgn_paper_encoder_kwargs
 hgn_paper_config.decoder_kwargs = hgn_paper_decoder_kwargs
 hgn_paper_config.has_latent_transform = True
 hgn_paper_config.latent_transform_kwargs = hgn_paper_latent_transform_kwargs
 hgn_paper_config.num_inference_steps = 31
 hgn_paper_config.num_target_steps = 0
 hgn_paper_config.objective_type = "GECO"
 # The two (latent_training_type, training_data_split) combinations that the
 # benchmark sweeps iterate over, expressed as flattened config overrides.
 forward_overlap_by_one = {
    model_prefix + "latent_training_type": "forward",
    model_prefix + "training_data_split": "overlap_by_one",
 }
 forward_backward_include_inference = {
    model_prefix + "latent_training_type": "forward_backward",
    model_prefix + "training_data_split": "include_inference",
 }
 latent_training_sweep = [
    forward_overlap_by_one,
    forward_backward_include_inference,
 ]
 def sym_metric_hgn_plus_plus_sweep():
  """HGN++ experimental sweep for the SyMetric paper.

  Returns:
    Tuple `(model_config, sweeps)`. `sweeps` holds eight flattened override
    dicts: four `elbo_beta_final` values for forward training with
    `overlap_by_one`, followed by the same four values for forward-backward
    training with `include_inference`.
  """
  model_config = copy.deepcopy(default_config_dict)
  model_config.name = "HGN"
  training_setups = (
      ("forward", "overlap_by_one"),
      ("forward_backward", "include_inference"),
  )
  sweeps = [
      {
          config_prefix + "optimizer.kwargs.learning_rate": 1.5e-4,
          model_prefix + "latent_training_type": training_type,
          model_prefix + "training_data_split": data_split,
          model_prefix + "elbo_beta_final": beta,
      }
      for training_type, data_split in training_setups
      for beta in [0.001, 0.1, 1.0, 2.0]
  ]
  return model_config, sweeps
 def sym_metric_hgn_sweep():
  """HGN experimental sweep for the SyMetric paper.

  Returns:
    Tuple `(model_config, sweeps)`, where `sweeps` has a single entry without
    any overrides, so sweep index 0 runs `hgn_paper_config` as is.
  """
  model_config = copy.deepcopy(hgn_paper_config)
  model_config.name = "HGN"
  # One sweep entry with no overrides. `list(dict())` would be an empty list,
  # which makes every `sweeps[sweep_index]` lookup in `get_config` fail.
  return model_config, [dict()]
 def benchmark_hgn_overlap_sweep():
  """HGN++ sweep for the benchmark paper.

  Returns:
    Tuple `(model_config, sweeps)`. `sweeps` enumerates every
    `elbo_beta_final` value (outer) combined with every entry of
    `latent_training_sweep` (inner).
  """
  model_config = copy.deepcopy(default_config_dict)
  model_config.name = "HGN"
  sweeps = [
      {
          config_prefix + "optimizer.kwargs.learning_rate": 1.5e-4,
          model_prefix + "elbo_beta_final": beta,
          **train_dict,
      }
      for beta in [0.001, 0.1, 1.0, 2.0]
      for train_dict in latent_training_sweep
  ]
  return model_config, sweeps
 def benchmark_lgn_sweep():
  """LGN sweep for the benchmark paper.

  Every sweep entry sets the latent system's `kinetic_func_form` to
  `"matrix_dep_pure_quad"`.

  Returns:
    Tuple `(model_config, sweeps)`. `sweeps` enumerates every
    `elbo_beta_final` value (outer) combined with every entry of
    `latent_training_sweep` (inner).
  """
  model_config = copy.deepcopy(default_config_dict)
  model_config.name = "LGN"
  sweeps = [
      {
          config_prefix + "optimizer.kwargs.learning_rate": 1.5e-4,
          model_prefix + "latent_system_kwargs.kinetic_func_form":
              "matrix_dep_pure_quad",
          model_prefix + "elbo_beta_final": beta,
          **train_dict,
      }
      for beta in [0.001, 0.1, 1.0, 2.0]
      for train_dict in latent_training_sweep
  ]
  return model_config, sweeps
 def benchmark_ode_sweep():
  """Neural ODE sweep for the benchmark paper.

  NOTE(review): `integrator_method` is addressed directly under
  `model_kwargs` here, whereas `default_latent_system_kwargs` declares an
  `integrator_method` placeholder under `latent_system_kwargs` - confirm
  which location the model reads.

  Returns:
    Tuple `(model_config, sweeps)`. `sweeps` enumerates `elbo_beta_final`
    values (outer), integrators `"adaptive"` and `"rk2"` (middle) and the
    entries of `latent_training_sweep` (inner).
  """
  model_config = copy.deepcopy(default_config_dict)
  model_config.name = "ODE"
  sweeps = [
      {
          config_prefix + "optimizer.kwargs.learning_rate": 1.5e-4,
          model_prefix + "integrator_method": integrator,
          model_prefix + "elbo_beta_final": beta,
          **train_dict,
      }
      for beta in [0.001, 0.1, 1.0, 2.0]
      for integrator in ("adaptive", "rk2")
      for train_dict in latent_training_sweep
  ]
  return model_config, sweeps
 def benchmark_rgn_sweep():
  """RGN sweep for the benchmark paper.

  Returns:
    Tuple `(model_config, sweeps)`. `sweeps` enumerates every
    `elbo_beta_final` value (outer) with the latent system's `residual` flag
    set to True and then False (inner).
  """
  model_config = copy.deepcopy(default_config_dict)
  model_config.name = "RGN"
  sweeps = [
      {
          config_prefix + "optimizer.kwargs.learning_rate": 1.5e-4,
          model_prefix + "latent_system_kwargs.residual": use_residual,
          model_prefix + "elbo_beta_final": beta,
      }
      for beta in [0.001, 0.1, 1.0, 2.0]
      for use_residual in (True, False)
  ]
  return model_config, sweeps
 def benchmark_ar_sweep():
  """AR sweep for the benchmark paper.

  The base config gets an extra `latent_dynamics_type` field (initially
  `"vanilla"`), which each sweep entry then overrides.

  Returns:
    Tuple `(model_config, sweeps)`. `sweeps` enumerates every
    `elbo_beta_final` value (outer) combined with the dynamics types
    `"vanilla"`, `"lstm"` and `"gru"` (inner).
  """
  model_config = copy.deepcopy(default_config_dict)
  model_config.name = "AR"
  model_config.latent_dynamics_type = "vanilla"
  sweeps = [
      {
          config_prefix + "optimizer.kwargs.learning_rate": 1.5e-4,
          model_prefix + "latent_dynamics_type": dynamics_type,
          model_prefix + "elbo_beta_final": beta,
      }
      for beta in [0.001, 0.1, 1.0, 2.0]
      for dynamics_type in ("vanilla", "lstm", "gru")
  ]
  return model_config, sweeps
@@ -0,0 +1,52 @@
 #!/bin/bash
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Script to execute a single configuration on all datasets.
 #
 # Arguments:
 #   $1 - configuration name
 #   $2 - how many sweeps the configuration contains
 # Bail out early (exit status 2) unless exactly two arguments were given.
 if [[ "$#" -ne 2 ]]; then
   echo "You must provide exactly two arguments - the configuration name and " \
   "how many sweeps it contains. For example:"
   echo "./launch_all.sh sym_metric_hgn_plus_plus_sweep 1"
   exit 2
 fi
 readonly CONFIG_NAME="$1"
 readonly NUM_SWEEPS="$2"
 # Directory containing this script; launch_local.sh is looked up next to it.
 readonly DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Dataset names, each handed to launch_local.sh in turn.
 DATASETS=(
  "toy_physics/mass_spring"
  "toy_physics/mass_spring_colors"
  "toy_physics/mass_spring_colors_friction"
  "toy_physics/pendulum"
  "toy_physics/pendulum_colors"
  "toy_physics/pendulum_colors_friction"
  "toy_physics/two_body"
  "toy_physics/two_body_colors"
  "toy_physics/double_pendulum"
  "toy_physics/double_pendulum_colors"
  "toy_physics/double_pendulum_colors_friction"
  "molecular_dynamics/lj_4"
  "molecular_dynamics/lj_16"
  "multi_agent/rock_paper_scissors"
  "multi_agent/matching_pennies"
  "mujoco_room/circle"
  "mujoco_room/spiral"
 )
 for dataset_name in "${DATASETS[@]}"; do
  "${DIR}/launch_local.sh" "${CONFIG_NAME}" "${NUM_SWEEPS}" "${dataset_name}"
 done
@@ -0,0 +1,40 @@
 #!/bin/bash
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # A script to execute a single configuration name on a given dataset.
 #
 # Arguments:
 #   $1 - configuration name
 #   $2 - number of sweeps the configuration contains
 #   $3 - dataset name
 # Bail out early (exit status 2) unless exactly three arguments were given.
 if [[ "$#" -ne 3 ]]; then
   echo "You must provide exactly three arguments - the configuration name, " \
   "the number of sweeps it contains and the dataset name. For example:"
   echo "./launch_local.sh sym_metric_hgn_plus_plus_sweep 1 " \
   "toy_physics/mass_spring"
   exit 2
 fi
 readonly CONFIG_NAME="$1"
 readonly NUM_SWEEPS="$2"
 readonly DATASET="$3"
 echo "Running with config ${CONFIG_NAME} on ${DATASET}."
 # The training script and the configs file live next to this script.
 readonly EXPERIMENT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 readonly TRAIN_FILE="${EXPERIMENT_DIR}/jaxline_train.py"
 readonly CONFIG_FILE="${EXPERIMENT_DIR}/jaxline_configs.py"
 # Run the sweep indices 0 .. NUM_SWEEPS-1 one after another.
 for sweep_index in $(seq 0 $((NUM_SWEEPS - 1))); do
  python3 "${TRAIN_FILE}" \
    --config="${CONFIG_FILE}:${CONFIG_NAME},${sweep_index},${DATASET}" \
    --jaxline_mode="train" \
    --logtostderr
 done
@@ -0,0 +1,247 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module containing code for computing various metrics for training and evaluation."""
 from typing import Callable, Dict, Optional
 import distrax
 import haiku as hk
 import jax
 import jax.nn as nn
 import jax.numpy as jnp
 import numpy as np
 import physics_inspired_models.utils as utils
 # Signature of the reconstruction callable used by `evaluation_only_statistics`,
 # which invokes it as `(params, inputs, rng, forward)` and expects a
 # distribution over the reconstructed sequence back.
 _ReconstructFunc = Callable[[utils.Params, jnp.ndarray, jnp.ndarray, bool],
                            distrax.Distribution]
 def calculate_small_latents(dist, threshold=0.5):
  """Computes the average number of latents whose variance is below a threshold.

  A latent dimension is counted when its variance is smaller than `threshold`
  and the absolute value of its mean is larger than 0.1.

  Args:
    dist: A `distrax.Normal` distribution over the latents.
    threshold: Upper bound on the variance for a latent to be counted.

  Returns:
    The number of counted latents, summed over axis 1 and averaged over the
    remaining axes.

  Raises:
    NotImplementedError: If `dist` is not a `distrax.Normal`.
  """
  if not isinstance(dist, distrax.Normal):
    raise NotImplementedError(
        "calculate_small_latents only supports distrax.Normal, got "
        f"{type(dist).__name__}.")
  latent_means = dist.mean()
  # The threshold is applied to the variance, not the standard deviation.
  latent_variances = dist.variance()
  is_small = (latent_variances < threshold) & (jnp.abs(latent_means) > 0.1)
  return jnp.mean(jnp.sum(is_small, axis=1))
 def compute_scale(
    targets: jnp.ndarray,
    rescale_by: Optional[str]
 ) -> jnp.ndarray:
  """Computes a scaling factor based on targets shape and the rescale_by argument.

  Args:
    targets: The targets array; only its shape is used.
    rescale_by: Either `None` (no rescaling) or "pixels_and_time".

  Returns:
    A scalar array: one when `rescale_by` is `None`, otherwise the product of
    the last four dimensions of `targets`.

  Raises:
    ValueError: If `rescale_by` is neither `None` nor "pixels_and_time".
  """
  if rescale_by is None:
    return jnp.ones([])
  if rescale_by == "pixels_and_time":
    # The shape is static, so the product is computed with numpy.
    return jnp.asarray(np.prod(targets.shape[-4:]))
  raise ValueError(f"Unrecognized rescale_by={rescale_by}.")
 def compute_data_domain_stats(
    p_x: distrax.Distribution,
    targets: jnp.ndarray
 ) -> Dict[str, jnp.ndarray]:
  """Computes statistics in the data domain, such as L2 and negative log-likelihood.

  All axes from the third one onwards are reduced for the `*_over_time`
  statistics; the remaining statistics additionally sum over axis 1.

  Args:
    p_x: The predictive distribution; its `mean()` and `log_prob(targets)` are
      used.
    targets: The target values (presumably [batch, time, ...] - confirm
      against callers).

  Returns:
    A dictionary with the squared error (`l2*`), the squared error relative to
    the squared norm of the targets (`l2*_norm`) and the negative
    log-likelihood (`neg_log_p_x*`).
  """
  axis = tuple(range(2, targets.ndim))
  l2_over_time = jnp.sum((p_x.mean() - targets) ** 2, axis=axis)
  l2 = jnp.sum(l2_over_time, axis=1)

  # Calculate relative L2 normalised by image "length". The same reduction axes
  # as above are used so that the shapes always agree with `l2_over_time`.
  norm_factor = jnp.sum(targets**2, axis=axis)
  l2_over_time_norm = l2_over_time / norm_factor
  l2_norm = jnp.sum(l2_over_time_norm, axis=1)

  # Compute negative log-likelihood under p(x)
  neg_log_p_x_over_time = - jnp.sum(p_x.log_prob(targets), axis=axis)
  neg_log_p_x = jnp.sum(neg_log_p_x_over_time, axis=1)

  return dict(
      neg_log_p_x_over_time=neg_log_p_x_over_time,
      neg_log_p_x=neg_log_p_x,
      l2_over_time=l2_over_time,
      l2=l2,
      l2_over_time_norm=l2_over_time_norm,
      l2_norm=l2_norm,
  )
 def compute_vae_stats(
    neg_log_p_x: jnp.ndarray,
    rng: jnp.ndarray,
    q_z: distrax.Distribution,
    prior: distrax.Distribution
 ) -> Dict[str, jnp.ndarray]:
  """Computes the KL(q(z|x)||p(z)) and the negative ELBO, which are used for VAE models.

  Args:
    neg_log_p_x: The negative log-likelihood of the data.
    rng: Random key passed to the KL estimator.
    q_z: The approximate posterior over the latents.
    prior: The prior over the latents.

  Returns:
    A dictionary with `kl`, summed over all axes but the first, and
    `neg_elbo = neg_log_p_x + kl`.
  """
  # Compute the KL
  kl = distrax.estimate_kl_best_effort(q_z, prior, rng_key=rng, num_samples=1)
  # Use the jax reduction with a (hashable) tuple of axes.
  kl = jnp.sum(kl, axis=tuple(range(1, kl.ndim)))
  # Sanity check
  assert kl.shape == neg_log_p_x.shape
  return dict(
      kl=kl,
      neg_elbo=neg_log_p_x + kl,
  )
 def training_statistics(
    p_x: distrax.Distribution,
    targets: jnp.ndarray,
    rescale_by: Optional[str],
    rng: Optional[jnp.ndarray] = None,
    q_z: Optional[distrax.Distribution] = None,
    prior: Optional[distrax.Distribution] = None,
    p_x_learned_sigma: bool = False
 ) -> Dict[str, jnp.ndarray]:
  """Computes various statistics we track during training.

  Args:
    p_x: The predictive distribution over the targets.
    targets: The target values.
    rescale_by: How to rescale the statistics, see `compute_scale`.
    rng: Random key for the KL estimate. Must be given together with `q_z` and
      `prior`, or not at all.
    q_z: Optional approximate posterior over the latents.
    prior: Optional prior over the latents.
    p_x_learned_sigma: Whether to also report the first entry of the variance
      of `p_x` under the key "p_x_sigma".

  Returns:
    A dictionary of statistics. Everything except "p_x_sigma" and
    "small_latents" is divided by the scale.

  Raises:
    ValueError: If only some of `rng`, `q_z` and `prior` are provided.
  """
  stats = compute_data_domain_stats(p_x, targets)

  vae_args = (rng, q_z, prior)
  if all(arg is not None for arg in vae_args):
    stats.update(compute_vae_stats(stats["neg_log_p_x"], rng, q_z, prior))
  elif any(arg is not None for arg in vae_args):
    raise ValueError("`rng`, `q_z` and `prior` must either all be provided or "
                     "all be None.")

  # Rescale these stats accordingly
  scale = compute_scale(targets, rescale_by)
  # Note that "_over_time" stats are getting normalised by time here
  stats = jax.tree_util.tree_map(lambda x: x / scale, stats)
  if p_x_learned_sigma:
    # This reads the first entry of the (flattened) variance of p(x).
    stats["p_x_sigma"] = p_x.variance().reshape([-1])[0]
  if q_z is not None:
    stats["small_latents"] = calculate_small_latents(q_z)
  return stats
 def evaluation_only_statistics(
    reconstruct_func: _ReconstructFunc,
    params: hk.Params,
    inputs: jnp.ndarray,
    rng: jnp.ndarray,
    rescale_by: Optional[str],
    can_run_backwards: bool,
    train_sequence_length: int,
    reconstruction_skip: int,
    p_x_learned_sigma: bool = False,
 ) -> Dict[str, jnp.ndarray]:
  """Computes various statistics we track only during evaluation.

  The model is unrolled over the whole trajectory and the statistics of
  `training_statistics` are computed separately on the part of the trajectory
  covered during training ("train"), on the remainder ("extrapolation") and on
  the whole of it ("full").

  Args:
    reconstruct_func: Callable used to reconstruct the full sequence.
    params: The model parameters.
    inputs: The model inputs, from which the images are extracted.
    rng: Random key passed to `reconstruct_func`.
    rescale_by: How to rescale the statistics, see `compute_scale`.
    can_run_backwards: Whether to also evaluate the backward reconstruction.
    train_sequence_length: The sequence length used during training.
    reconstruction_skip: Number of frames excluded from the targets (from the
      start for the forward direction, from the end for the backward one).
    p_x_learned_sigma: Forwarded to `training_statistics`.

  Returns:
    A dictionary keyed by "<prefix>_<suffix>_<stat>", where prefix is
    "forward", "backward" or (when `can_run_backwards`) "combined".
  """
  full_trajectory = utils.extract_image(inputs)
  prefixes = ("forward", "backward") if can_run_backwards else ("forward",)

  full_forward_targets = jax.tree_util.tree_map(
      lambda x: x[:, reconstruction_skip:], full_trajectory)
  full_backward_targets = jax.tree_util.tree_map(
      lambda x: x[:, :x.shape[1]-reconstruction_skip], full_trajectory)
  train_targets_length = train_sequence_length - reconstruction_skip
  full_targets_length = full_forward_targets.shape[1]

  stats = dict()
  keys = ()

  for prefix in prefixes:
    # Fully unroll the model and reconstruct the whole sequence
    full_prediction = reconstruct_func(params, full_trajectory, rng,
                                       prefix == "forward")
    assert isinstance(full_prediction, distrax.Normal)
    full_targets = (full_forward_targets if prefix == "forward" else
                    full_backward_targets)
    # In cases where the model can run backwards it is possible to reconstruct
    # parts which were intended to be skipped, so here we take care of that.
    if full_prediction.mean().shape[1] > full_targets_length:
      if prefix == "forward":
        full_prediction = jax.tree_util.tree_map(
            lambda x: x[:, -full_targets_length:], full_prediction)
      else:
        full_prediction = jax.tree_util.tree_map(
            lambda x: x[:, :full_targets_length], full_prediction)

    # Based on the prefix and suffix fetch correct predictions and targets
    for suffix in ("train", "extrapolation", "full"):
      if prefix == "forward" and suffix == "train":
        predict, targets = jax.tree_util.tree_map(
            lambda x: x[:, :train_targets_length],
            (full_prediction, full_targets))
      elif prefix == "forward" and suffix == "extrapolation":
        predict, targets = jax.tree_util.tree_map(
            lambda x: x[:, train_targets_length:],
            (full_prediction, full_targets))
      elif prefix == "backward" and suffix == "train":
        predict, targets = jax.tree_util.tree_map(
            lambda x: x[:, -train_targets_length:],
            (full_prediction, full_targets))
      elif prefix == "backward" and suffix == "extrapolation":
        predict, targets = jax.tree_util.tree_map(
            lambda x: x[:, :-train_targets_length],
            (full_prediction, full_targets))
      else:
        predict, targets = full_prediction, full_targets

      # Compute train statistics
      train_stats = training_statistics(predict, targets, rescale_by,
                                        p_x_learned_sigma=p_x_learned_sigma)
      for key, value in train_stats.items():
        stats[prefix + "_" + suffix + "_" + key] = value
      # Remember the names of the statistics for the combined metrics below
      keys = tuple(train_stats.keys())

  # Make a combined metric averaging forward and backward
  if can_run_backwards:
    for suffix in ("train", "extrapolation", "full"):
      for key in keys:
        forward = stats["forward_" + suffix + "_" + key]
        backward = stats["backward_" + suffix + "_" + key]
        combined = (forward + backward) / 2
        stats["combined_" + suffix + "_" + key] = combined

  return stats
 def geco_objective(
    l2_loss,
    kl,
    alpha,
    kappa,
    constraint_ema,
    lambda_var,
    is_training
 ) -> Dict[str, jnp.ndarray]:
  """Computes the GECO objective and the statistics used for its updates.

  Args:
    l2_loss: The reconstruction loss entering the constraint.
    kl: The KL term added to the constrained loss.
    alpha: Decay of the moving average of the constraint.
    kappa: The tolerance subtracted from `l2_loss` to form the constraint.
    constraint_ema: The current moving average of the constraint.
    lambda_var: Unconstrained variable, mapped through a softplus to obtain the
      Lagrange multiplier.
    is_training: Whether to update the moving average of the constraint.

  Returns:
    A dictionary with the loss, the multiplier, the current constraint and its
    moving average.
  """
  # C_t
  current_constraint = l2_loss - kappa
  # C_ma is only updated during training
  if is_training:
    moving_average = alpha * constraint_ema + (1 - alpha) * current_constraint
  else:
    moving_average = constraint_ema
  multiplier = jnp.broadcast_to(nn.softplus(lambda_var), moving_average.shape)
  # Special op which takes care of getting all gradients correct
  constrained_loss = utils.geco_lagrange_product(
      multiplier, moving_average, current_constraint)
  return {
      "loss": constrained_loss + kl,
      "geco_multiplier": multiplier,
      "geco_constraint": current_constraint,
      "geco_constraint_ema": moving_average,
  }
 def elbo_objective(neg_log_p_x, kl, final_beta, beta_delay, step):
  """Computes objective for optimizing the Evidence Lower Bound (ELBO).

  The weight of the KL term is increased linearly from zero to `final_beta`
  over the first `beta_delay` steps.

  Args:
    neg_log_p_x: The negative log-likelihood of the data.
    kl: The KL term.
    final_beta: The final weight of the KL term.
    beta_delay: Number of steps over which the weight is increased; zero
      disables the warm-up. Must be a concrete Python number.
    step: The current step; may be a Python number or a (traced) jax array.

  Returns:
    A dictionary with the `loss` and the `elbo_beta` used to compute it.
  """
  if beta_delay == 0:
    beta = final_beta
  else:
    # `step` can be a traced array (e.g. under `jax.jit`), for which Python's
    # `float()` conversion fails, so the warm-up is computed with jax ops.
    warmup_fraction = jnp.asarray(step, dtype=jnp.float32) / float(beta_delay)
    beta = jnp.minimum(warmup_fraction, 1.0) * final_beta
  return dict(
      loss=neg_log_p_x + beta * kl,
      elbo_beta=beta
  )
@@ -0,0 +1,14 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
@@ -0,0 +1,345 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module for all autoregressive models."""
 import functools
 from typing import Any, Dict, Mapping, Optional, Sequence, Tuple, Union
 import distrax
 import haiku as hk
 from jax import lax
 import jax.numpy as jnp
 import jax.random as jnr
 import physics_inspired_models.metrics as metrics
 import physics_inspired_models.models.base as base
 import physics_inspired_models.models.networks as nets
 import physics_inspired_models.utils as utils
 class TeacherForcingAutoregressiveModel(base.SequenceModel):
  """A standard autoregressive model trained via teacher forcing."""
  def __init__(
      self,
      latent_system_dim: int,
      latent_system_net_type: str,
      latent_system_kwargs: Dict[str, Any],
      latent_dynamics_type: str,
      encoder_aggregation_type: Optional[str],
      decoder_de_aggregation_type: Optional[str],
      encoder_kwargs: Dict[str, Any],
      decoder_kwargs: Dict[str, Any],
      num_inference_steps: int,
      num_target_steps: int,
      name: Optional[str] = None,
      **kwargs
  ):
    """Initializes the model and builds its recurrent latent dynamics.

    Args:
      latent_system_dim: Output dimensionality of the recurrent network.
      latent_system_net_type: Network type of the latent system; only "mlp" is
        supported.
      latent_system_kwargs: Must contain "net_kwargs", which are forwarded to
        `nets.make_flexible_recurrent_net`.
      latent_dynamics_type: The recurrent core type.
      encoder_aggregation_type: Forwarded to the base class.
      decoder_de_aggregation_type: Forwarded to the base class.
      encoder_kwargs: Encoder arguments; "distribution_name" is overridden with
        `None` on a copy.
      decoder_kwargs: Forwarded to the base class.
      num_inference_steps: Forwarded to the base class.
      num_target_steps: Forwarded to the base class.
      name: Forwarded to the base class.
      **kwargs: Any other arguments, forwarded to the base class.

    Raises:
      ValueError: If a latent transform is requested or the latent system is
        not an "mlp".
    """
    # Remove any parameters from vae models (on a copy, so that the caller's
    # dictionary is not modified).
    encoder_kwargs = dict(**encoder_kwargs)
    encoder_kwargs["distribution_name"] = None
    if kwargs.get("has_latent_transform", False):
      raise ValueError("We do not support AR models with latent transform.")
    super().__init__(
        can_run_backwards=False,
        latent_system_dim=latent_system_dim,
        latent_system_net_type=latent_system_net_type,
        latent_system_kwargs=latent_system_kwargs,
        encoder_aggregation_type=encoder_aggregation_type,
        decoder_de_aggregation_type=decoder_de_aggregation_type,
        encoder_kwargs=encoder_kwargs,
        decoder_kwargs=decoder_kwargs,
        num_inference_steps=num_inference_steps,
        num_target_steps=num_target_steps,
        name=name,
        **kwargs
    )
    self.latent_dynamics_type = latent_dynamics_type

    # Arguments checks
    if self.latent_system_net_type != "mlp":
      raise ValueError("Currently we do not support non-mlp AR models.")

    def recurrence_function(sequence, initial_state=None):
      # `sequence` is unrolled along its leading axis and its second axis is
      # used as the batch size of the initial state.
      core = nets.make_flexible_recurrent_net(
          core_type=latent_dynamics_type,
          net_type=latent_system_net_type,
          output_dims=self.latent_system_dim,
          **self.latent_system_kwargs["net_kwargs"])
      # Compare against None explicitly: the state can be an array, whose truth
      # value is ambiguous (or not available at all while tracing).
      if initial_state is None:
        initial_state = core.initial_state(sequence.shape[1])
      # The output of this single call is discarded.
      # NOTE(review): presumably it only exists to create the core's parameters
      # outside of `hk.dynamic_unroll` - confirm.
      core(sequence[0], initial_state)
      return hk.dynamic_unroll(core, sequence, initial_state)

    self.recurrence = hk.transform(recurrence_function)
  def process_inputs_for_encoder(self, x: jnp.ndarray) -> jnp.ndarray:
    """Returns the inputs unchanged, as this model applies no pre-processing."""
    return x
  def process_latents_for_dynamics(self, z: jnp.ndarray) -> jnp.ndarray:
    """Returns the latents unchanged, as the dynamics consume them directly."""
    return z
  def process_latents_for_decoder(self, z: jnp.ndarray) -> jnp.ndarray:
    """Returns the latents unchanged, as the decoder consumes them directly."""
    return z
  @property
  def inferred_index(self) -> int:
    """The inferred time index, which is the last of the inference steps."""
    return self.num_inference_steps - 1
  @property
  def train_sequence_length(self) -> int:
    """The sequence length used for training, equal to `num_target_steps`."""
    return self.num_target_steps
  def train_data_split(
      self,
      images: jnp.ndarray
  ) -> Tuple[jnp.ndarray, jnp.ndarray, Mapping[str, Any]]:
    """Splits the images into teacher-forcing inputs and one-step-ahead targets.

    Args:
      images: The images, with the time dimension second.

    Returns:
      The inputs (all but the last training frame), the targets (all but the
      first training frame) and the keyword arguments for unrolling the latent
      dynamics by a single step.
    """
    clipped = images[:, :self.train_sequence_length]
    unroll_kwargs = dict(
        num_steps_forward=1,
        num_steps_backward=0,
        include_z0=False)
    return clipped[:, :-1], clipped[:, 1:], unroll_kwargs
  def unroll_without_inputs(
      self,
      params: utils.Params,
      rng: jnp.ndarray,
      x_init: jnp.ndarray,
      h_init: jnp.ndarray,
      num_steps: int,
      is_training: bool
  ) -> Tuple[Tuple[distrax.Distribution, jnp.ndarray], Any]:
    """Unrolls the model by feeding its own mean predictions back as inputs.

    Every step encodes the previous prediction, advances the recurrent network
    by a single step and decodes the resulting latent.

    Args:
      params: The model parameters.
      rng: Random key, split into one key per step.
      x_init: The input to the first step.
      h_init: The recurrent state before the first step.
      num_steps: The number of steps to unroll for.
      is_training: Forwarded to the encoder and the decoder.

    Returns:
      The output of `lax.scan`: the final carry `(x_last, h_last)` and the
      per-step outputs `(p_x, z)`.

    Raises:
      ValueError: If `num_steps` is smaller than one.
    """
    if num_steps < 1:
      raise ValueError("`num_steps` must be at least 1.")
    def step_fn(carry, key):
      x_last, h_last = carry
      enc_key, dec_key = jnr.split(key)
      z_in_next = self.encoder.apply(params, enc_key, x_last,
                                     is_training=is_training)
      # The recurrence expects a leading sequence axis, hence the `[None]`
      # before and the `[0]` after the call.
      z_next, h_next = self.recurrence.apply(params, None, z_in_next[None],
                                             h_last)
      p_x_next = self.decode_latents(params, dec_key, z_next[0],
                                     is_training=is_training)
      # The mean of the prediction is the input of the next step.
      return (p_x_next.mean(), h_next), (p_x_next, z_next[0])
    return lax.scan(
        step_fn,
        init=(x_init, h_init),
        xs=jnr.split(rng, num_steps)
    )
  def unroll_latent_dynamics(
      self,
      z: jnp.ndarray,
      params: utils.Params,
      key: jnp.ndarray,
      num_steps_forward: int,
      num_steps_backward: int,
      include_z0: bool,
      is_training: bool,
      **kwargs: Any
  ) -> Tuple[jnp.ndarray, Mapping[str, jnp.ndarray]]:
    """Runs the recurrent network on `z` and optionally unrolls it further.

    Args:
      z: The input latents, with the time dimension second.
      params: The model parameters.
      key: Random key.
      num_steps_forward: With 1 only the recurrent outputs on `z` are returned;
        with more, the model is additionally unrolled on its own predictions
        for `num_steps_forward - 1` steps.
      num_steps_backward: Must be zero.
      include_z0: Not used by this model.
      is_training: Forwarded to the unroll without inputs.
      **kwargs: Not used by this model.

    Returns:
      The latents with the time dimension second and an empty dictionary.

    Raises:
      ValueError: If `num_steps_backward` is not zero or `num_steps_forward` is
        smaller than one.
    """
    init_key, unroll_key, dec_key = jnr.split(key, 3)
    if num_steps_backward != 0:
      raise ValueError("This model can not run backwards.")
    # Change 'z' time dimension to be first
    z = jnp.swapaxes(z, 0, 1)
    # Run recurrent model on inputs
    z_0, h_0 = self.recurrence.apply(params, init_key, z)
    if num_steps_forward == 1:
      z_t = z_0
    elif num_steps_forward > 1:
      # Decode the last latent to obtain the first input of the free unroll.
      # NOTE(review): this decodes with `is_training=False` regardless of the
      # `is_training` argument - confirm that this is intended.
      p_x_0 = self.decode_latents(params, dec_key, z_0[-1], is_training=False)
      _, (_, z_t) = self.unroll_without_inputs(
          params=params,
          rng=unroll_key,
          x_init=p_x_0.mean(),
          h_init=h_0,
          num_steps=num_steps_forward-1,
          is_training=is_training
      )
      z_t = jnp.concatenate([z_0, z_t], axis=0)
    else:
      raise ValueError("num_steps_forward should be at least 1.")
    # Make time dimension second
    return jnp.swapaxes(z_t, 0, 1), dict()
  def _models_core(
      self,
      params: utils.Params,
      keys: jnp.ndarray,
      image_data: jnp.ndarray,
      is_training: bool,
      **unroll_kwargs: Any
  ) -> Tuple[distrax.Distribution, jnp.ndarray, jnp.ndarray]:
    """Runs the encoder, the latent dynamics and the decoder.

    Args:
      params: The model parameters.
      keys: Six random keys, of which the first, third, fourth and fifth are
        used.
      image_data: The images to encode.
      is_training: Forwarded to all of the networks.
      **unroll_kwargs: Forwarded to `unroll_latent_dynamics`.

    Returns:
      The predictive distribution p(x|z), the latents fed to the dynamics and
      the latents fed to the decoder.
    """
    enc_key, _, transform_key, unroll_key, dec_key, _ = keys

    # Calculate latent input representation
    inference_data = self.process_inputs_for_encoder(image_data)
    z_raw = self.encoder.apply(params, enc_key, inference_data,
                               is_training=is_training)

    # Apply latent transformation (should be identity)
    z0 = self.apply_latent_transform(params, transform_key, z_raw,
                                     is_training=is_training)
    z0 = self.process_latents_for_dynamics(z0)

    # Calculate latent output representation
    decoder_z, _ = self.unroll_latent_dynamics(
        z=z0,
        params=params,
        key=unroll_key,
        is_training=is_training,
        **unroll_kwargs
    )
    decoder_z = self.process_latents_for_decoder(decoder_z)

    # Compute p(x|z)
    p_x = self.decode_latents(params, dec_key, decoder_z,
                              is_training=is_training)
    return p_x, z0, decoder_z
  def training_objectives(
      self,
      params: hk.Params,
      state: hk.State,
      rng: jnp.ndarray,
      inputs: jnp.ndarray,
      step: jnp.ndarray,
      is_training: bool = True,
      use_mean_for_eval_stats: bool = True
  ) -> Tuple[jnp.ndarray, Sequence[Dict[str, jnp.ndarray]]]:
    """Computes the training objective and any supporting stats.

    Args:
      params: The model parameters.
      state: Not used by this model.
      rng: Random key.
      inputs: The model inputs, from which the images are extracted.
      step: Not used by this model.
      is_training: Whether this is a training step. When `False` the evaluation
        only statistics are computed as well.
      use_mean_for_eval_stats: Passed as `use_mean` to `self.reconstruct` for
        the evaluation only statistics.

    Returns:
      The loss and the tuple `(dict(), stats, dict())`.
    """
    # Split all rng keys
    keys = jnr.split(rng, 6)

    # Process training data
    images = utils.extract_image(inputs)
    image_data, target_data, unroll_kwargs = self.train_data_split(images)

    p_x, _, _ = self._models_core(
        params=params,
        keys=keys,
        image_data=image_data,
        is_training=is_training,
        **unroll_kwargs
    )

    # Compute training statistics
    stats = metrics.training_statistics(
        p_x=p_x,
        targets=target_data,
        rescale_by=self.rescale_by,
        p_x_learned_sigma=self.decoder_kwargs.get("learned_sigma", False)
    )

    # The loss is just the negative log-likelihood (e.g. the L2 loss)
    stats["loss"] = stats["neg_log_p_x"]

    if not is_training:
      # Add the evaluation only statistics when not training.
      # We need to be able to set `use_mean = False` for some of the tests
      # NOTE(review): `rng` has already been split into `keys` above and is
      # reused here - confirm that reusing the key is acceptable.
      stats.update(metrics.evaluation_only_statistics(
          reconstruct_func=functools.partial(
              self.reconstruct, use_mean=use_mean_for_eval_stats),
          params=params,
          inputs=inputs,
          rng=rng,
          rescale_by=self.rescale_by,
          can_run_backwards=self.can_run_backwards,
          train_sequence_length=self.train_sequence_length,
          reconstruction_skip=1,
          p_x_learned_sigma=self.decoder_kwargs.get("learned_sigma", False)
      ))

    return stats["loss"], (dict(), stats, dict())
  def reconstruct(
      self,
      params: utils.Params,
      inputs: jnp.ndarray,
      rng: jnp.ndarray,
      forward: bool,
      use_mean: bool = True,
  ) -> distrax.Distribution:
    """Reconstructs the input sequence.

    The first `num_inference_steps` images are encoded and the model is then
    unrolled on its own predictions for the remainder of the sequence.

    Args:
      params: The model parameters.
      inputs: The model inputs, from which the images are extracted.
      rng: Random key, split into six keys for the networks.
      forward: Must be `True`, as this model can not run backwards.
      use_mean: Not used by this model.

    Returns:
      The predictive distribution over the reconstructed sequence.

    Raises:
      ValueError: If `forward` is `False`.
    """
    if not forward:
      raise ValueError("This model can not run backwards.")
    images = utils.extract_image(inputs)
    image_data = images[:, :self.num_inference_steps]
    return self._models_core(
        params=params,
        keys=jnr.split(rng, 6),
        image_data=image_data,
        is_training=False,
        num_steps_forward=images.shape[1] - self.num_inference_steps,
        num_steps_backward=0,
        include_z0=False,
    )[0]
  def gt_state_and_latents(
      self,
      params: hk.Params,
      rng: jnp.ndarray,
      inputs: Dict[str, jnp.ndarray],
      seq_length: int,
      is_training: bool = False,
      unroll_direction: str = "forward",
      **kwargs: Dict[str, Any]
  ) -> Tuple[jnp.ndarray, jnp.ndarray,
             Union[distrax.Distribution, jnp.ndarray]]:
    """Computes the ground state and matching latents.

    Args:
      params: The model parameters.
      rng: Random key, split into six keys for the networks.
      inputs: The model inputs, containing the images and the ground truth
        state.
      seq_length: The number of ground truth states to return, taken from the
        second time step onwards.
      is_training: Not used; the model is always run with `is_training=False`.
      unroll_direction: Must be "forward".
      **kwargs: Not used by this model.

    Returns:
      The ground truth state, the latents fed to the decoder and the latents
      fed to the dynamics.

    Raises:
      ValueError: If `unroll_direction` is not "forward".
    """
    # Raise (rather than assert) so that the check also runs under `-O` and is
    # consistent with `reconstruct`.
    if unroll_direction != "forward":
      raise ValueError("This model can not run backwards.")
    images = utils.extract_image(inputs)
    gt_state = utils.extract_gt_state(inputs)
    image_data = images[:, :self.num_inference_steps]
    gt_state = gt_state[:, 1:seq_length + 1]
    _, z_in, z_out = self._models_core(
        params=params,
        keys=jnr.split(rng, 6),
        image_data=image_data,
        is_training=False,
        num_steps_forward=images.shape[1] - self.num_inference_steps,
        num_steps_backward=0,
        include_z0=False,
    )
    return gt_state, z_out, z_in
  def _init_non_model_params_and_state(
      self,
      rng: jnp.ndarray
  ) -> Tuple[Dict[str, jnp.ndarray], Dict[str, jnp.ndarray]]:
    """Returns empty parameters and state, as this model has none of these."""
    return dict(), dict()
  def _init_latent_system(
      self,
      rng: jnp.ndarray,
      z: jnp.ndarray,
      **kwargs: Any
  ) -> utils.Params:
    """Initializes the parameters of the recurrent network; `kwargs` are ignored."""
    # NOTE(review): `z` is passed as is, whereas `unroll_latent_dynamics` swaps
    # its first two axes before applying the recurrence - presumably this does
    # not affect the shapes of the initialized parameters; confirm.
    return self.recurrence.init(rng, z)
@@ -0,0 +1,360 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module containing the base abstract classes for sequence models."""
 import abc
 from typing import Any, Dict, Generic, Mapping, Optional, Sequence, Tuple, TypeVar, Union
 from absl import logging
 import distrax
 import haiku as hk
 import jax
 import jax.numpy as jnp
 import jax.random as jnr
 from physics_inspired_models import utils
 from physics_inspired_models.models import networks
 # Type of the latent representation consumed and produced by the latent
 # dynamics (see `process_latents_for_dynamics` and `unroll_latent_dynamics`).
 T = TypeVar("T")
 class SequenceModel(abc.ABC, Generic[T]):
  """An abstract class for sequence models."""
  def __init__(
      self,
      can_run_backwards: bool,
      latent_system_dim: int,
      latent_system_net_type: str,
      latent_system_kwargs: Dict[str, Any],
      encoder_aggregation_type: Optional[str],
      decoder_de_aggregation_type: Optional[str],
      encoder_kwargs: Dict[str, Any],
      decoder_kwargs: Dict[str, Any],
      num_inference_steps: int,
      num_target_steps: int,
      name: Optional[str],
      latent_spatial_shape: Optional[Tuple[int, int]] = (4, 4),
      has_latent_transform: bool = False,
      latent_transform_kwargs: Optional[Dict[str, Any]] = None,
      rescale_by: Optional[str] = "pixels_and_time",
      data_format: str = "NHWC",
      **unused_kwargs
  ):
    """Validates the arguments and builds the encoder, decoder and latent transform.

    Args:
      can_run_backwards: Whether the model can be unrolled backwards in time.
      latent_system_dim: Latent dimensionality, used as the encoder's
        `latent_dim` and the latent transform's `output_dims`.
      latent_system_net_type: Network type of the latent system. For "conv" no
        aggregation types may be given, otherwise they are required.
      latent_system_kwargs: Stored on the instance for use by subclasses.
      encoder_aggregation_type: Aggregation type of the encoder.
      decoder_de_aggregation_type: De-aggregation type of the decoder. When
        `None` it is derived from `encoder_aggregation_type`.
      encoder_kwargs: Extra encoder arguments; a falsy value becomes `dict()`.
      decoder_kwargs: Extra decoder arguments; a falsy value becomes `dict()`.
      num_inference_steps: Stored on the instance.
      num_target_steps: Stored on the instance.
      name: Stored on the instance.
      latent_spatial_shape: The decoder's `initial_spatial_shape`.
      has_latent_transform: Whether to build a latent transform network.
      latent_transform_kwargs: Arguments of the latent transform; required when
        `has_latent_transform` is set.
      rescale_by: Stored on the instance.
      data_format: Forwarded to the encoder and the decoder.
      **unused_kwargs: Logged with a warning and forwarded to
        `super().__init__`.

    Raises:
      ValueError: If the arguments are inconsistent with each other.
    """
    # Arguments checks
    encoder_kwargs = encoder_kwargs or dict()
    decoder_kwargs = decoder_kwargs or dict()

    # Set the decoder de-aggregation type the "same" type as the encoder if not
    # provided
    if (decoder_de_aggregation_type is None and
        encoder_aggregation_type is not None):
      if encoder_aggregation_type == "linear_projection":
        decoder_de_aggregation_type = "linear_projection"
      elif encoder_aggregation_type in ("mean", "max"):
        decoder_de_aggregation_type = "tile"
      else:
        raise ValueError(f"Unrecognized encoder_aggregation_type="
                         f"{encoder_aggregation_type}")
    if latent_system_net_type == "conv":
      if encoder_aggregation_type is not None:
        raise ValueError("When the latent system is convolutional, the encoder "
                         "aggregation type should be None.")
      if decoder_de_aggregation_type is not None:
        raise ValueError("When the latent system is convolutional, the decoder "
                         "aggregation type should be None.")
    else:
      if encoder_aggregation_type is None:
        raise ValueError("When the latent system is not convolutional, the "
                         "you must provide an encoder aggregation type.")
      if decoder_de_aggregation_type is None:
        raise ValueError("When the latent system is not convolutional, the "
                         "you must provide an decoder aggregation type.")
    if has_latent_transform and latent_transform_kwargs is None:
      raise ValueError("When using latent transformation you have to provide "
                       "the latent_transform_kwargs argument.")
    if unused_kwargs:
      logging.warning("Unused kwargs: %s", str(unused_kwargs))
    # NOTE(review): the unused kwargs are forwarded to the next `__init__` in
    # the MRO. If that is `object.__init__`, a non-empty dictionary raises a
    # TypeError right after the warning above - confirm this is intended.
    super().__init__(**unused_kwargs)
    self.can_run_backwards = can_run_backwards
    self.latent_system_dim = latent_system_dim
    self.latent_system_kwargs = latent_system_kwargs
    self.latent_system_net_type = latent_system_net_type
    self.latent_spatial_shape = latent_spatial_shape
    self.num_inference_steps = num_inference_steps
    self.num_target_steps = num_target_steps
    self.rescale_by = rescale_by
    self.data_format = data_format
    self.name = name

    # Encoder
    self.encoder_kwargs = encoder_kwargs
    self.encoder = hk.transform(
        lambda *args, **kwargs: networks.SpatialConvEncoder(  # pylint: disable=unnecessary-lambda,g-long-lambda
            latent_dim=latent_system_dim,
            aggregation_type=encoder_aggregation_type,
            data_format=data_format,
            name="Encoder",
            **encoder_kwargs
        )(*args, **kwargs))

    # Decoder
    self.decoder_kwargs = decoder_kwargs
    self.decoder = hk.transform(
        lambda *args, **kwargs: networks.SpatialConvDecoder(  # pylint: disable=unnecessary-lambda,g-long-lambda
            initial_spatial_shape=self.latent_spatial_shape,
            de_aggregation_type=decoder_de_aggregation_type,
            data_format=data_format,
            max_de_aggregation_dims=self.latent_system_dim // 2,
            name="Decoder",
            **decoder_kwargs,
        )(*args, **kwargs))

    # Optional latent transform, applied to the output of the encoder
    self.has_latent_transform = has_latent_transform
    if has_latent_transform:
      self.latent_transform = hk.transform(
          lambda *args, **kwargs: networks.make_flexible_net(  # pylint: disable=unnecessary-lambda,g-long-lambda
              net_type=latent_system_net_type,
              output_dims=latent_system_dim,
              name="LatentTransform",
              **latent_transform_kwargs
          )(*args, **kwargs))
    else:
      self.latent_transform = None

    # Lazily created jitted version of `self._init`, see `self.init`.
    self._jit_init = None
  @property
  @abc.abstractmethod
  def train_sequence_length(self) -> int:
    """Returns the total length of a sequence needed for training or evaluation."""
    pass
  @abc.abstractmethod
  def train_data_split(
      self,
      images: jnp.ndarray,
  ) -> Tuple[jnp.ndarray, jnp.ndarray, Mapping[str, Any]]:
    """Extracts from the inputs the data splits for training.

    Args:
      images: The full sequence of images.

    Returns:
      A tuple of three elements, of which `_init` uses the first as the data to
      run the encoder on. The return annotation suggests the others are the
      target data and a mapping of extra keyword arguments.
    """
    pass
  def decode_latents(
      self,
      params: hk.Params,
      rng: jnp.ndarray,
      z: jnp.ndarray,
      **kwargs: Any
  ) -> distrax.Distribution:
    """Decodes the latent variable given the parameters of the model.

    All leading dimensions of `z` are flattened into a single batch dimension
    before applying the decoder and restored on its outputs afterwards.

    Args:
      params: The parameters passed to the decoder.
      rng: Random key passed to the decoder.
      z: The latents, whose trailing dimension (for "mlp") or trailing spatial
        and feature dimensions (for "conv") are kept intact.
      **kwargs: Forwarded to the decoder.

    Returns:
      The output of the decoder, with the leading dimensions of `z` restored.

    Raises:
      NotImplementedError: If the latent system net type is neither "mlp" nor
        "conv".
    """
    # Allow to run with both the full parameters and only the decoders
    if self.latent_system_net_type == "mlp":
      fixed_dims = 1
    elif self.latent_system_net_type == "conv":
      fixed_dims = 1 + len(self.latent_spatial_shape)
    else:
      raise NotImplementedError(
          f"Unsupported latent_system_net_type={self.latent_system_net_type}.")
    n_shape = z.shape[:-fixed_dims]
    z = z.reshape((-1,) + z.shape[-fixed_dims:])
    x = self.decoder.apply(params, rng, z, **kwargs)
    # `jax.tree_util.tree_map` is the stable name of this function; the
    # top-level `jax.tree_map` alias has been removed from recent JAX releases.
    return jax.tree_util.tree_map(
        lambda a: a.reshape(n_shape + a.shape[1:]), x)
  def apply_latent_transform(
      self,
      params: hk.Params,
      key: jnp.ndarray,
      z: jnp.ndarray,
      **kwargs: Any
  ) -> jnp.ndarray:
    """Applies the latent transform to `z`, or returns `z` if there is none."""
    transform = self.latent_transform
    if transform is None:
      return z
    return transform.apply(params, key, z, **kwargs)
  @abc.abstractmethod
  def process_inputs_for_encoder(self, x: jnp.ndarray) -> jnp.ndarray:
    """Pre-processes the inputs before they are passed to the encoder."""
    pass
  @abc.abstractmethod
  def process_latents_for_dynamics(self, z: jnp.ndarray) -> T:
    """Converts the latents into the representation used by the latent dynamics."""
    pass
  @abc.abstractmethod
  def process_latents_for_decoder(self, z: T) -> jnp.ndarray:
    """Converts the output of the latent dynamics into the input of the decoder."""
    pass
  @abc.abstractmethod
  def unroll_latent_dynamics(
      self,
      z: T,
      params: utils.Params,
      key: jnp.ndarray,
      num_steps_forward: int,
      num_steps_backward: int,
      include_z0: bool,
      is_training: bool,
      **kwargs: Any
  ) -> Tuple[T, Mapping[str, jnp.ndarray]]:
    """Unrolls the latent dynamics starting from z and pre-processing for the decoder.

    Args:
      z: The latents to start the unroll from.
      params: The model parameters.
      key: Random key.
      num_steps_forward: Number of steps to unroll forward in time.
      num_steps_backward: Number of steps to unroll backward in time.
      include_z0: Presumably whether the initial latent is part of the output -
        confirm against the implementations.
      is_training: Whether the model is being trained.
      **kwargs: Any implementation specific arguments.

    Returns:
      The unrolled latents and a mapping of any additional outputs.
    """
    pass
  @abc.abstractmethod
  def reconstruct(
      self,
      params: utils.Params,
      inputs: jnp.ndarray,
      rng_key: Optional[jnp.ndarray],
      forward: bool,
  ) -> distrax.Distribution:
    """Using the first `num_inference_steps` parts of inputs reconstructs the rest.

    Args:
      params: The model parameters.
      inputs: The full input sequence.
      rng_key: Optional random key.
      forward: Whether to reconstruct forward (`True`) or backward in time.

    Returns:
      The distribution over the reconstructed sequence.
    """
    pass
  @abc.abstractmethod
  def training_objectives(
      self,
      params: utils.Params,
      state: hk.State,
      rng: jnp.ndarray,
      inputs: Union[Dict[str, jnp.ndarray], jnp.ndarray],
      step: jnp.ndarray,
      is_training: bool = True,
      use_mean_for_eval_stats: bool = True
  ) -> Tuple[jnp.ndarray, Sequence[Dict[str, jnp.ndarray]]]:
    """Returns all training objectives statistics and update states.

    Args:
      params: The model parameters.
      state: The model state.
      rng: Random key.
      inputs: The model inputs.
      step: The current training step.
      is_training: Whether the model is being trained.
      use_mean_for_eval_stats: Presumably whether evaluation statistics use the
        mean rather than a sample - confirm against the implementations.

    Returns:
      The loss and a sequence of dictionaries with supporting values.
    """
    pass
  @property
  @abc.abstractmethod
  def inferred_index(self):
    """Returns the time index in the input sequence, for which the encoder infers.

    If the encoder takes as input the sequence x[0:n-1], where
    `n = self.num_inference_steps`, then this outputs the index `k` relative to
    the beginning of the input sequence `x_0`, which the encoder infers.
    """
    pass
  @property
  def inferred_right_offset(self):
    """Returns the offset of `inferred_index` from the last inference step."""
    return self.num_inference_steps - 1 - self.inferred_index
  @abc.abstractmethod
  def gt_state_and_latents(
      self,
      params: hk.Params,
      rng: jnp.ndarray,
      inputs: Dict[str, jnp.ndarray],
      seq_len: int,
      is_training: bool = False,
      unroll_direction: str = "forward",
      **kwargs: Dict[str, Any]
  ) -> Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray]:
    """Computes the ground state and matching latents.

    Args:
      params: The model parameters.
      rng: Random key.
      inputs: The model inputs.
      seq_len: The length of the sequence to compute these for.
      is_training: Whether the model is being trained.
      unroll_direction: The direction in which to unroll the model.
      **kwargs: Any implementation specific arguments.

    Returns:
      A tuple of three arrays: the ground truth state and the matching latents.
    """
    pass
  @abc.abstractmethod
  def _init_non_model_params_and_state(
      self,
      rng: jnp.ndarray
  ) -> Tuple[utils.Params, utils.Params]:
    """Initializes any non-model parameters and state.

    Args:
      rng: Random key.

    Returns:
      The parameters and the state, in this order.
    """
    pass
  @abc.abstractmethod
  def _init_latent_system(
      self,
      rng: jnp.ndarray,
      z: jnp.ndarray,
      **kwargs: Any
  ) -> hk.Params:
    """Initializes the parameters of the latent system.

    Args:
      rng: Random key.
      z: Example latents, as returned by `process_latents_for_dynamics`.
      **kwargs: Extra arguments; `_init` passes `is_training=True`.

    Returns:
      The parameters of the latent system.
    """
    pass
  def _init(
      self,
      rng: jnp.ndarray,
      images: jnp.ndarray
  ) -> Tuple[hk.Params, hk.State]:
    """Initializes the whole model parameters and state.

    The encoder, the optional latent transform, the latent system and the
    decoder are initialized one after another, each on the output of the
    previous one. A fresh key is split off `rng` for every step, so the order
    of the statements determines the resulting parameters.

    Args:
      rng: Random key.
      images: Example images to initialize the model with.

    Returns:
      The merged parameters and the state, both as immutable dictionaries.
    """
    inference_data, _, _ = self.train_data_split(images)
    # Initialize parameters and state for the vae training
    rng, key = jnr.split(rng)
    params, state = self._init_non_model_params_and_state(key)

    # Initialize and run encoder
    inference_data = self.process_inputs_for_encoder(inference_data)
    rng, key = jnr.split(rng)
    encoder_params = self.encoder.init(key, inference_data, is_training=True)
    rng, key = jnr.split(rng)
    z_in = self.encoder.apply(encoder_params, key, inference_data,
                              is_training=True)
    # For probabilistic models this will be a distribution
    if isinstance(z_in, distrax.Distribution):
      z_in = z_in.mean()

    # Initialize and run the optional latent transform
    if self.latent_transform is not None:
      rng, key = jnr.split(rng)
      transform_params = self.latent_transform.init(key, z_in, is_training=True)
      rng, key = jnr.split(rng)
      z_in = self.latent_transform.apply(transform_params, key, z_in,
                                         is_training=True)
    else:
      transform_params = dict()

    # Initialize and run the latent system
    z_in = self.process_latents_for_dynamics(z_in)
    rng, key = jnr.split(rng)
    latent_params = self._init_latent_system(key, z_in, is_training=True)
    rng, key = jnr.split(rng)
    z_out, _ = self.unroll_latent_dynamics(
        z=z_in,
        params=latent_params,
        key=key,
        num_steps_forward=1,
        num_steps_backward=0,
        include_z0=False,
        is_training=True
    )
    z_out = self.process_latents_for_decoder(z_out)

    # Initialize and run the decoder (on the first slice along axis 1 only)
    rng, key = jnr.split(rng)
    decoder_params = self.decoder.init(key, z_out[:, 0], is_training=True)
    _ = self.decoder.apply(decoder_params, rng, z_out[:, 0], is_training=True)

    # Combine all and make immutable
    params = hk.data_structures.merge(params, encoder_params, transform_params,
                                      latent_params, decoder_params)
    params = hk.data_structures.to_immutable_dict(params)
    state = hk.data_structures.to_immutable_dict(state)

    return params, state
  def init(
      self,
      rng: jnp.ndarray,
      inputs_or_shape: Union[jnp.ndarray, Mapping[str, jnp.ndarray],
                             Sequence[int]],
  ) -> Tuple[utils.Params, hk.State]:
    """Initializes the whole model parameters and state.

    Args:
      rng: Random key.
      inputs_or_shape: Either example inputs, or the shape of the images as a
        tuple or list of integers, in which case zeros are used as images.

    Returns:
      The parameters and the state of the model.
    """
    is_shape = (isinstance(inputs_or_shape, (tuple, list))
                and isinstance(inputs_or_shape[0], int))
    images = (jnp.zeros(inputs_or_shape) if is_shape
              else utils.extract_image(inputs_or_shape))
    # The jitted initializer is created on first use and cached afterwards.
    if self._jit_init is None:
      self._jit_init = jax.jit(self._init)
    return self._jit_init(rng, images)
@@ -0,0 +1,117 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module for all models."""
 from typing import Any, Dict, Optional
 import physics_inspired_models.models.autoregressive as autoregressive
 import physics_inspired_models.models.deterministic_vae as deterministic_vae
 # Keys of `latent_system_kwargs` that `construct_model` removes for the "AR",
 # "RGN" and "ODE" models before building them.
 _physics_arguments = (
    "input_space", "simulation_space", "potential_func_form",
    "kinetic_func_form", "hgn_kinetic_func_form", "lgn_kinetic_func_form",
    "parametrize_mass_matrix", "hgn_parametrize_mass_matrix",
    "lgn_parametrize_mass_matrix", "mass_eps"
 )
 def construct_model(
    name: str,
    *args,
    **kwargs: Dict[str, Any]
 ):
  """Constructs the correct instance of a model given the short name.

  Args:
    name: Short name of the model; one of "AR", "RGN", "ODE", "HGN", "LGN" or
      "PGN".
    *args: Positional arguments forwarded unchanged to the model constructor.
    **kwargs: Keyword arguments forwarded to the model constructor. The
      entries `latent_dynamics_type` and `latent_system_kwargs` are popped,
      validated and adjusted for the chosen model before being forwarded.

  Returns:
    An `autoregressive.TeacherForcingAutoregressiveModel` when `name` is "AR",
    otherwise a `deterministic_vae.DeterministicLatentsGenerativeModel`.

  Raises:
    NotImplementedError: If `name` is not one of the supported short names.
    AssertionError: If `latent_dynamics_type`, `input_space` or
      `simulation_space` conflict with the value the chosen model requires.
  """
  latent_dynamics_type: Optional[str] = kwargs.pop("latent_dynamics_type", None)  # pytype: disable=annotation-type-mismatch
  # Work on a copy so the pops below never mutate the caller's dictionary.
  latent_system_kwargs = dict(**kwargs.pop("latent_system_kwargs", dict()))
  if name == "AR":
    assert latent_dynamics_type in ("vanilla", "lstm", "gru")
    # These arguments are not part of the AR models
    for k in _physics_arguments + ("integrator_method", "residual"):
      latent_system_kwargs.pop(k, None)
    return autoregressive.TeacherForcingAutoregressiveModel(
        *args,
        latent_dynamics_type=latent_dynamics_type,
        latent_system_kwargs=latent_system_kwargs,
        **kwargs
    )
  elif name == "RGN":
    assert latent_dynamics_type in ("Discrete", None)
    latent_dynamics_type = "Discrete"
    # These arguments are not part of the RGN models
    for k in _physics_arguments + ("integrator_method",):
      latent_system_kwargs.pop(k, None)
  elif name == "ODE":
    assert latent_dynamics_type in ("ODE", None)
    latent_dynamics_type = "ODE"
    # These arguments are not part of the ODE models
    for k in _physics_arguments + ("residual",):
      latent_system_kwargs.pop(k, None)
  elif name == "HGN":
    assert latent_dynamics_type in ("Physics", None)
    latent_dynamics_type = "Physics"
    assert latent_system_kwargs.get("input_space", None) in ("momentum", None)
    latent_system_kwargs["input_space"] = "momentum"
    assert (latent_system_kwargs.get("simulation_space", None)
            in ("momentum", None))
    latent_system_kwargs["simulation_space"] = "momentum"
    # An HGN-specific kinetic function form overrides the generic one
    hgn_specific = latent_system_kwargs.pop("hgn_kinetic_func_form", None)
    if hgn_specific is not None:
      latent_system_kwargs["kinetic_func_form"] = hgn_specific
    # An HGN-specific mass matrix setting overrides the generic one
    hgn_specific = latent_system_kwargs.pop("hgn_parametrize_mass_matrix",
                                            None)
    if hgn_specific is not None:
      latent_system_kwargs["parametrize_mass_matrix"] = hgn_specific
    # These arguments are not part of the HGN models
    latent_system_kwargs.pop("residual", None)
    latent_system_kwargs.pop("lgn_kinetic_func_form", None)
    latent_system_kwargs.pop("lgn_parametrize_mass_matrix", None)
  elif name == "LGN":
    assert latent_dynamics_type in ("Physics", None)
    latent_dynamics_type = "Physics"
    assert latent_system_kwargs.get("input_space", None) in ("velocity", None)
    latent_system_kwargs["input_space"] = "velocity"
    assert (latent_system_kwargs.get("simulation_space", None) in
            ("velocity", None))
    latent_system_kwargs["simulation_space"] = "velocity"
    # An LGN-specific kinetic function form overrides the generic one
    lgn_specific = latent_system_kwargs.pop("lgn_kinetic_func_form", None)
    if lgn_specific is not None:
      latent_system_kwargs["kinetic_func_form"] = lgn_specific
    # An LGN-specific mass matrix setting overrides the generic one
    lgn_specific = latent_system_kwargs.pop("lgn_parametrize_mass_matrix",
                                            None)
    if lgn_specific is not None:
      latent_system_kwargs["parametrize_mass_matrix"] = lgn_specific
    # These arguments are not part of the LGN models
    latent_system_kwargs.pop("residual", None)
    latent_system_kwargs.pop("hgn_kinetic_func_form", None)
    latent_system_kwargs.pop("hgn_parametrize_mass_matrix", None)
  elif name == "PGN":
    assert latent_dynamics_type in ("Physics", None)
    latent_dynamics_type = "Physics"
    # These arguments are not part of the PGN models. "residual" is optional
    # here exactly as in every other branch, so a missing key must not raise.
    latent_system_kwargs.pop("residual", None)
    latent_system_kwargs.pop("hgn_kinetic_func_form", None)
    latent_system_kwargs.pop("hgn_parametrize_mass_matrix", None)
    latent_system_kwargs.pop("lgn_kinetic_func_form", None)
    latent_system_kwargs.pop("lgn_parametrize_mass_matrix", None)
  else:
    raise NotImplementedError(f"Unrecognized model name={name}.")
  return deterministic_vae.DeterministicLatentsGenerativeModel(
      *args,
      latent_dynamics_type=latent_dynamics_type,
      latent_system_kwargs=latent_system_kwargs,
      **kwargs)
@@ -0,0 +1,494 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module containing all of the networks as Haiku modules."""
 from typing import Any, Callable, Mapping, Optional, Sequence, Union
 from absl import logging
 import distrax
 import haiku as hk
 import jax.numpy as jnp
 from physics_inspired_models import utils
 Activation = Union[str, Callable[[jnp.ndarray], jnp.ndarray]]
 class DenseNet(hk.Module):
  """Multi-layer perceptron built from a stack of `hk.Linear` layers."""
  def __init__(
      self,
      num_units: Sequence[int],
      activate_final: bool = False,
      activation: Activation = "leaky_relu",
      name: Optional[str] = None):
    """Creates one linear layer per entry of `num_units`.

    Args:
      num_units: Output size of every linear layer, in order.
      activate_final: Whether the activation is also applied after the last
        layer.
      activation: Activation specification resolved by `utils.get_activation`.
      name: Optional name of the Haiku module.
    """
    super().__init__(name=name)
    self.num_units = num_units
    self.num_layers = len(self.num_units)
    self.activate_final = activate_final
    self.activation = utils.get_activation(activation)
    self.linear_modules = [
        hk.Linear(output_size=self.num_units[index], name=f"ff_{index}")
        for index in range(self.num_layers)
    ]
  def __call__(self, inputs: jnp.ndarray, is_training: bool):
    """Applies the layers in order; `is_training` is accepted but not used."""
    last_index = self.num_layers - 1
    hidden = inputs
    for index, layer in enumerate(self.linear_modules):
      hidden = layer(hidden)
      # Intermediate layers are always activated, the last one only on request.
      if self.activate_final or index < last_index:
        hidden = self.activation(hidden)
    return hidden
 class Conv2DNet(hk.Module):
  """A stack of 2D convolutions with optional batch norm and activations."""
  def __init__(
      self,
      output_channels: Sequence[int],
      kernel_shapes: Union[int, Sequence[int]] = 3,
      strides: Union[int, Sequence[int]] = 1,
      padding: Union[str, Sequence[str]] = "SAME",
      data_format: str = "NHWC",
      with_batch_norm: bool = False,
      activate_final: bool = False,
      activation: Activation = "leaky_relu",
      name: Optional[str] = None):
    """Creates one `hk.Conv2D` (and optionally one `hk.BatchNorm`) per layer.

    Args:
      output_channels: Number of output channels of every layer, in order.
      kernel_shapes: A single int shared by all layers or one entry per layer.
      strides: A single int shared by all layers or one entry per layer.
      padding: A single string shared by all layers or one entry per layer.
      data_format: Data format passed to every `hk.Conv2D`.
      with_batch_norm: Whether a batch norm follows every convolution.
      activate_final: Whether the activation is also applied after the last
        layer.
      activation: Activation specification resolved by `utils.get_activation`.
      name: Optional name of the Haiku module.

    Raises:
      ValueError: If `kernel_shapes`, `strides` or `padding` is a sequence
        whose length differs from the number of layers.
    """
    super().__init__(name=name)
    self.output_channels = tuple(output_channels)
    self.num_layers = len(self.output_channels)
    self.kernel_shapes = utils.bcast_if(kernel_shapes, int, self.num_layers)
    self.strides = utils.bcast_if(strides, int, self.num_layers)
    self.padding = utils.bcast_if(padding, str, self.num_layers)
    self.data_format = data_format
    self.with_batch_norm = with_batch_norm
    self.activate_final = activate_final
    self.activation = utils.get_activation(activation)
    # Every per-layer setting must provide exactly one entry per layer. The
    # message always reports the length of the setting that actually failed.
    for label, values in (("Kernel shapes", self.kernel_shapes),
                          ("Strides", self.strides),
                          ("Padding", self.padding)):
      if len(values) != self.num_layers:
        raise ValueError(f"{label} is of size {len(values)}, while "
                         f"output_channels is of size {self.num_layers}.")
    self.conv_modules = []
    self.bn_modules = []
    for i in range(self.num_layers):
      self.conv_modules.append(
          hk.Conv2D(
              output_channels=self.output_channels[i],
              kernel_shape=self.kernel_shapes[i],
              stride=self.strides[i],
              padding=self.padding[i],
              data_format=data_format,
              name=f"conv_2d_{i}")
      )
      if with_batch_norm:
        self.bn_modules.append(
            hk.BatchNorm(
                create_offset=True,
                create_scale=False,
                decay_rate=0.999,
                name=f"batch_norm_{i}")
        )
      else:
        # Keep the two lists aligned so they can be zipped in `__call__`.
        self.bn_modules.append(None)
  def __call__(self, inputs: jnp.ndarray, is_training: bool):
    """Applies conv -> (batch norm) -> (activation) for every layer.

    Args:
      inputs: Array with exactly four dimensions.
      is_training: Forwarded to the batch norm layers, if there are any.

    Returns:
      The output of the last layer.
    """
    assert inputs.ndim == 4
    net = inputs
    for i, (conv, bn) in enumerate(zip(self.conv_modules, self.bn_modules)):
      net = conv(net)
      # Batch norm
      if bn is not None:
        net = bn(net, is_training=is_training)
      # Intermediate layers are always activated, the last one only on request.
      if i < self.num_layers - 1 or self.activate_final:
        net = self.activation(net)
    return net
 class SpatialConvEncoder(hk.Module):
  """Spatial Convolutional Encoder for learning the Hamiltonian.

  The encoder applies `num_blocks` `Conv2DNet` blocks, optionally aggregates
  the spatial axes and finally wraps the result into a distribution.
  """
  def __init__(
      self,
      latent_dim: int,
      conv_channels: Union[Sequence[int], int],
      num_blocks: int,
      blocks_depth: int = 2,
      distribution_name: str = "diagonal_normal",
      aggregation_type: Optional[str] = None,
      data_format: str = "NHWC",
      activation: Activation = "leaky_relu",
      scale_factor: int = 2,
      kernel_shapes: Union[Sequence[int], int] = 3,
      padding: Union[Sequence[str], str] = "SAME",
      name: Optional[str] = None):
    """Creates the convolutional blocks of the encoder.

    Args:
      latent_dim: Size of the latent; the network outputs `latent_dim`
        features when `distribution_name` is None and `2 * latent_dim` for
        "diagonal_normal".
      conv_channels: Either a single int, used for every layer of every block,
        or one sequence of channel sizes per block.
      num_blocks: Number of convolutional blocks.
      blocks_depth: Number of layers per block; only used when `conv_channels`
        is an int.
      distribution_name: None or "diagonal_normal".
      aggregation_type: None, "max", "mean" or "linear_projection".
      data_format: Data format passed to every block.
      activation: Activation passed to every block.
      scale_factor: Stride of the first convolution of every block.
      kernel_shapes: A single int or a sequence of kernel shapes per layer.
      padding: Padding passed to every block.
      name: Optional name of the Haiku module.

    Raises:
      ValueError: If `aggregation_type` or `distribution_name` is not one of
        the recognized values.
    """
    super().__init__(name=name)
    if aggregation_type not in (None, "max", "mean", "linear_projection"):
      raise ValueError(f"Unrecognized aggregation_type={aggregation_type}.")
    self.latent_dim = latent_dim
    self.conv_channels = conv_channels
    self.num_blocks = num_blocks
    self.scale_factor = scale_factor
    self.data_format = data_format
    self.distribution_name = distribution_name
    self.aggregation_type = aggregation_type
    # Compute the required size of the output
    if distribution_name is None:
      self.output_dim = latent_dim
    elif distribution_name == "diagonal_normal":
      # Twice the latent size: one half for the mean, one for the log scale.
      self.output_dim = 2 * latent_dim
    else:
      raise ValueError(f"Unrecognized distribution_name={distribution_name}.")
    # Normalize `conv_channels` to one list of channels per block; the last
    # block gets one extra layer that produces `self.output_dim` channels.
    if isinstance(conv_channels, int):
      conv_channels = [[conv_channels] * blocks_depth
                       for _ in range(num_blocks)]
      conv_channels[-1] += [self.output_dim]
    else:
      assert isinstance(conv_channels, (list, tuple))
      assert len(conv_channels) == num_blocks
      conv_channels = list(list(c) for c in conv_channels)
      conv_channels[-1].append(self.output_dim)
    # A tuple is converted to a list so it can be concatenated with a list
    # of extra kernel shapes below.
    if isinstance(kernel_shapes, tuple):
      kernel_shapes = list(kernel_shapes)
    # Convolutional blocks
    self.blocks = []
    for i, channels in enumerate(conv_channels):
      if isinstance(kernel_shapes, int):
        # Adding zero leaves an int kernel shape unchanged.
        extra_kernel_shapes = 0
      else:
        # Pad with kernels of size 3 for layers without an explicit shape
        # (empty when there are already at least `len(channels)` entries).
        extra_kernel_shapes = [3] * (len(channels) - len(kernel_shapes))
      self.blocks.append(Conv2DNet(
          output_channels=channels,
          kernel_shapes=kernel_shapes + extra_kernel_shapes,
          # Only the first convolution of a block is strided.
          strides=[self.scale_factor] + [1] * (len(channels) - 1),
          padding=padding,
          data_format=data_format,
          with_batch_norm=False,
          # Every block except the last one ends with an activation.
          activate_final=i < num_blocks - 1,
          activation=activation,
          name=f"block_{i}"
      ))
  def spatial_aggregation(self, x: jnp.ndarray) -> jnp.ndarray:
    """Reduces `x` over its spatial axes according to `aggregation_type`.

    Returns `x` unchanged for None, the max/mean over the two spatial axes for
    "max"/"mean", and for "linear_projection" a `hk.Linear` projection to
    `self.output_dim` of `x` with its last three axes flattened.
    """
    if self.aggregation_type is None:
      return x
    axis = (1, 2) if self.data_format == "NHWC" else (2, 3)
    if self.aggregation_type == "max":
      return jnp.max(x, axis=axis)
    if self.aggregation_type == "mean":
      return jnp.mean(x, axis=axis)
    if self.aggregation_type == "linear_projection":
      x = x.reshape(x.shape[:-3] + (-1,))
      return hk.Linear(self.output_dim, name="LinearProjection")(x)
    raise NotImplementedError()
  def make_distribution(self, net_output: jnp.ndarray) -> distrax.Distribution:
    """Converts the network output into the configured output type.

    Returns `net_output` itself when `distribution_name` is None. For
    "diagonal_normal" the output is split in two halves, `loc` and
    `log_scale`, and `distrax.Normal(loc, exp(log_scale))` is returned.
    """
    if self.distribution_name is None:
      return net_output
    elif self.distribution_name == "diagonal_normal":
      if self.aggregation_type is None:
        # No aggregation: split along the channel axis of the data format.
        split_axis, num_axes = self.data_format.index("C"), 3
      else:
        split_axis, num_axes = 1, 1
      # Add an extra axis if the input has more than 1 batch dimension
      split_axis += net_output.ndim - num_axes - 1
      loc, log_scale = jnp.split(net_output, 2, axis=split_axis)
      return distrax.Normal(loc, jnp.exp(log_scale))
    else:
      raise NotImplementedError()
  def __call__(
      self,
      inputs: jnp.ndarray,
      is_training: bool
  ) -> Union[jnp.ndarray, distrax.Distribution]:
    """Encodes `inputs`, treating all but the last three axes as batch axes.

    Args:
      inputs: Array whose last three axes are passed through the blocks.
      is_training: Forwarded to every convolutional block.

    Returns:
      The result of `make_distribution` on the encoded inputs, with the
      original leading axes restored.
    """
    # Treat any extra dimensions (like time) as the batch
    batched_shape = inputs.shape[:-3]
    net = jnp.reshape(inputs, (-1,) + inputs.shape[-3:])
    # Apply all blocks in sequence
    for block in self.blocks:
      net = block(net, is_training=is_training)
    # Final projection
    net = self.spatial_aggregation(net)
    # Reshape back to correct dimensions (like batch + time)
    net = jnp.reshape(net, batched_shape + net.shape[1:])
    # Return a distribution over the observations
    return self.make_distribution(net)
 class SpatialConvDecoder(hk.Module):
  """Spatial Convolutional Decoder for learning the Hamiltonian.

  The decoder optionally expands a flat latent to a spatial map, prepends
  learned constant channels and then alternates nearest-neighbour upsampling
  with `Conv2DNet` blocks.
  """
  def __init__(
      self,
      initial_spatial_shape: Sequence[int],
      conv_channels: Union[Sequence[int], int],
      num_blocks: int,
      max_de_aggregation_dims: int,
      blocks_depth: int = 2,
      scale_factor: int = 2,
      output_channels: int = 3,
      h_const_channels: int = 2,
      data_format: str = "NHWC",
      activation: Activation = "leaky_relu",
      learned_sigma: bool = False,
      de_aggregation_type: Optional[str] = None,
      final_activation: Activation = "sigmoid",
      discard_half_de_aggregated: bool = False,
      kernel_shapes: Union[Sequence[int], int] = 3,
      padding: Union[Sequence[str], str] = "SAME",
      name: Optional[str] = None):
    """Creates the convolutional blocks of the decoder.

    Args:
      initial_spatial_shape: Spatial shape of the map fed to the first block.
      conv_channels: Either a single int, used for every layer of every block,
        or one sequence of channel sizes per block.
      num_blocks: Number of convolutional blocks.
      max_de_aggregation_dims: Upper bound on the number of channels produced
        by the "linear_projection" de-aggregation; a falsy value disables the
        bound.
      blocks_depth: Number of layers per block; only used when `conv_channels`
        is an int.
      scale_factor: Upsampling factor applied before every block.
      output_channels: Number of channels produced by the last layer.
      h_const_channels: Number of learned constant channels that are prepended.
      data_format: "NHWC" or "NCHW".
      activation: Activation passed to every block.
      learned_sigma: Whether the scale of the output distribution is a learned
        parameter rather than the constant `1 / sqrt(2)`.
      de_aggregation_type: None, "tile" or "linear_projection".
      final_activation: Activation applied to the output of the last block.
      discard_half_de_aggregated: Whether only the first half of the channels
        of the de-aggregated input is kept.
      kernel_shapes: Kernel shapes passed to every block.
      padding: Padding passed to every block.
      name: Optional name of the Haiku module.

    Raises:
      ValueError: If `de_aggregation_type` is not a recognized value.
    """
    super().__init__(name=name)
    if de_aggregation_type not in (None, "tile", "linear_projection"):
      raise ValueError(f"Unrecognized de_aggregation_type="
                       f"{de_aggregation_type}.")
    self.num_blocks = num_blocks
    self.scale_factor = scale_factor
    self.h_const_channels = h_const_channels
    self.data_format = data_format
    self.learned_sigma = learned_sigma
    self.initial_spatial_shape = tuple(initial_spatial_shape)
    self.final_activation = utils.get_activation(final_activation)
    self.de_aggregation_type = de_aggregation_type
    self.max_de_aggregation_dims = max_de_aggregation_dims
    self.discard_half_de_aggregated = discard_half_de_aggregated
    # Normalize `conv_channels` to one list of channels per block; the last
    # block gets one extra layer that produces `output_channels` channels.
    if isinstance(conv_channels, int):
      conv_channels = [[conv_channels] * blocks_depth
                       for _ in range(num_blocks)]
      conv_channels[-1] += [output_channels]
    else:
      assert isinstance(conv_channels, (list, tuple))
      assert len(conv_channels) == num_blocks
      conv_channels = list(list(c) for c in conv_channels)
      conv_channels[-1].append(output_channels)
    # Convolutional blocks
    self.blocks = []
    for i, channels in enumerate(conv_channels):
      is_final_block = i == num_blocks - 1
      self.blocks.append(
          Conv2DNet(
              output_channels=channels,
              kernel_shapes=kernel_shapes,
              strides=1,
              padding=padding,
              data_format=data_format,
              with_batch_norm=False,
              # The last block is followed by `final_activation` instead.
              activate_final=not is_final_block,
              activation=activation,
              name=f"block_{i}"
          ))
  def spatial_de_aggregation(self, x: jnp.ndarray) -> jnp.ndarray:
    """Expands `x` to a spatial map of shape `initial_spatial_shape`.

    For None the input is returned unchanged after checking its spatial shape.
    For "linear_projection" a 2D input is projected with `hk.Linear` and
    reshaped to a map. For "tile" a 2D input is repeated over the spatial
    axes.
    """
    if self.de_aggregation_type is None:
      assert x.ndim >= 4
      if self.data_format == "NHWC":
        assert x.shape[1:3] == self.initial_spatial_shape
      elif self.data_format == "NCHW":
        assert x.shape[2:4] == self.initial_spatial_shape
      return x
    elif self.de_aggregation_type == "linear_projection":
      assert x.ndim == 2
      n, d = x.shape
      # Cap the number of channels, unless the cap is falsy (e.g. None or 0).
      d = min(d, self.max_de_aggregation_dims or d)
      out_d = d * self.initial_spatial_shape[0] * self.initial_spatial_shape[1]
      x = hk.Linear(out_d, name="LinearProjection")(x)
      if self.data_format == "NHWC":
        shape = (n,) + self.initial_spatial_shape + (d,)
      else:
        shape = (n, d) + self.initial_spatial_shape
      return x.reshape(shape)
    elif self.de_aggregation_type == "tile":
      assert x.ndim == 2
      if self.data_format == "NHWC":
        repeats = (1,) + self.initial_spatial_shape + (1,)
        x = x[:, None, None, :]
      else:
        repeats = (1, 1) + self.initial_spatial_shape
        x = x[:, :, None, None]
      return jnp.tile(x, repeats)
    else:
      raise NotImplementedError()
  def add_constant_channels(self, inputs: jnp.ndarray) -> jnp.ndarray:
    """Prepends the learned constant channels "h" along the channel axis."""
    # --------------------------------------------
    # This is purely for TF compatibility purposes
    if self.discard_half_de_aggregated:
      axis = self.data_format.index("C")
      inputs, _ = jnp.split(inputs, 2, axis=axis)
    # --------------------------------------------
    # An extra constant channels
    if self.data_format == "NHWC":
      h_shape = self.initial_spatial_shape + (self.h_const_channels,)
    else:
      h_shape = (self.h_const_channels,) + self.initial_spatial_shape
    h_const = hk.get_parameter("h", h_shape, dtype=inputs.dtype,
                               init=hk.initializers.Constant(1))
    # The parameter has no batch axis; tiling with four repeats adds one of
    # size `inputs.shape[0]`.
    h_const = jnp.tile(h_const, reps=[inputs.shape[0], 1, 1, 1])
    return jnp.concatenate([h_const, inputs], axis=self.data_format.index("C"))
  def make_distribution(self, net_output: jnp.ndarray) -> distrax.Distribution:
    """Returns a `distrax.Normal` centred at `net_output`.

    The scale is shared by all entries: `exp(log_scale)` for a learned scalar
    `log_scale` (initialised to `-log(2) / 2`) when `learned_sigma` is set,
    otherwise the constant `1 / sqrt(2)`.
    """
    if self.learned_sigma:
      init = hk.initializers.Constant(- jnp.log(2.0) / 2.0)
      log_scale = hk.get_parameter("log_scale", shape=(),
                                   dtype=net_output.dtype, init=init)
      scale = jnp.full_like(net_output, jnp.exp(log_scale))
    else:
      scale = jnp.full_like(net_output, 1 / jnp.sqrt(2.0))
    return distrax.Normal(net_output, scale)
  def __call__(
      self,
      inputs: jnp.ndarray,
      is_training: bool
  ) -> distrax.Distribution:
    """Decodes `inputs` into a distribution over the observations.

    Args:
      inputs: The latents, in the layout expected by `spatial_de_aggregation`.
      is_training: Forwarded to every convolutional block.

    Returns:
      The distribution built by `make_distribution` from the network output.
    """
    # Apply the spatial de-aggregation
    inputs = self.spatial_de_aggregation(inputs)
    # Add the parameterized constant channels
    net = self.add_constant_channels(inputs)
    # Apply all the blocks
    for block in self.blocks:
      # Up-sample the image. The data format must be passed explicitly, since
      # the helper defaults to "NHWC" and would otherwise scale the wrong axes
      # of an "NCHW" input.
      net = utils.nearest_neighbour_upsampling(
          net, self.scale_factor, data_format=self.data_format)
      # Apply the convolutional block
      net = block(net, is_training=is_training)
    # Apply any specific output nonlinearity
    net = self.final_activation(net)
    # Construct the distribution over the observations
    return self.make_distribution(net)
 def make_flexible_net(
    net_type: str,
    output_dims: int,
    conv_channels: Union[Sequence[int], int],
    num_units: Union[Sequence[int], int],
    num_layers: Optional[int],
    activation: Activation,
    activate_final: bool = False,
    kernel_shapes: Union[Sequence[int], int] = 3,
    strides: Union[Sequence[int], int] = 1,
    padding: Union[Sequence[str], str] = "SAME",
    name: Optional[str] = None,
    **unused_kwargs: Mapping[str, Any]
 ):
  """Builds either a `DenseNet` or a `Conv2DNet` from a shared set of options.

  Args:
    net_type: "mlp" for a `DenseNet` or "conv" for a `Conv2DNet`.
    output_dims: Size of the final layer, appended to the hidden sizes.
    conv_channels: Hidden channels of the "conv" network; an int is repeated
      `num_layers - 1` times.
    num_units: Hidden units of the "mlp" network; an int is repeated
      `num_layers - 1` times.
    num_layers: Total number of layers; required when the hidden sizes are
      given as an int.
    activation: Activation passed to the network.
    activate_final: Whether the last layer is followed by the activation.
    kernel_shapes: Kernel shapes of the "conv" network.
    strides: Strides of the "conv" network.
    padding: Padding of the "conv" network.
    name: Optional name of the created module.
    **unused_kwargs: Ignored; a warning is logged when any are given.

  Returns:
    The constructed network.

  Raises:
    NotImplementedError: If `net_type` is "transformer".
    ValueError: If `net_type` is not recognized.
  """
  def _layer_sizes(hidden):
    # Expands an int into per-layer sizes and appends the output layer.
    if isinstance(hidden, int):
      assert num_layers is not None
      return [hidden] * (num_layers - 1) + [output_dims]
    return list(hidden) + [output_dims]
  if unused_kwargs:
    logging.warning("Unused kwargs of `make_flexible_net`: %s",
                    str(unused_kwargs))
  if net_type == "mlp":
    return DenseNet(
        num_units=_layer_sizes(num_units),
        activation=activation,
        activate_final=activate_final,
        name=name
    )
  if net_type == "conv":
    return Conv2DNet(
        output_channels=_layer_sizes(conv_channels),
        kernel_shapes=kernel_shapes,
        strides=strides,
        padding=padding,
        activation=activation,
        activate_final=activate_final,
        name=name
    )
  if net_type == "transformer":
    raise NotImplementedError()
  raise ValueError(f"Unrecognized net_type={net_type}.")
 def make_flexible_recurrent_net(
    core_type: str,
    net_type: str,
    output_dims: int,
    num_units: Union[Sequence[int], int],
    num_layers: Optional[int],
    activation: Activation,
    activate_final: bool = False,
    name: Optional[str] = None,
    **unused_kwargs
 ):
  """Builds a `hk.DeepRNN` of recurrent cores interleaved with activations.

  Args:
    core_type: "vanilla", "lstm" or "gru" (case insensitive).
    net_type: Must be "mlp".
    output_dims: Hidden size of the last core.
    num_units: Hidden sizes of the preceding cores; a single value is repeated
      `num_layers - 1` times.
    num_layers: Number of cores; required unless `num_units` is a list or a
      tuple, in which case it is recomputed from it.
    activation: Activation specification resolved by `utils.get_activation`.
    activate_final: Whether an activation follows the last core.
    name: Prefix of the core names; defaults to the upper-cased `core_type`.
    **unused_kwargs: Ignored; a warning is logged when any are given.

  Returns:
    A `hk.DeepRNN` named "RNN".

  Raises:
    ValueError: If `net_type` is not "mlp" or `core_type` is not recognized.
  """
  if net_type != "mlp":
    raise ValueError("We do not support convolutional recurrent nets atm.")
  if unused_kwargs:
    logging.warning("Unused kwargs of `make_flexible_recurrent_net`: %s",
                    str(unused_kwargs))
  if not isinstance(num_units, (list, tuple)):
    assert num_layers is not None
    num_units = [num_units] * (num_layers - 1) + [output_dims]
  else:
    num_units = list(num_units) + [output_dims]
    num_layers = len(num_units)
  name = name or f"{core_type.upper()}"
  activation = utils.get_activation(activation)
  core_kind = core_type.lower()
  layers = []
  for index, hidden_size in enumerate(num_units):
    core_name = f"{name}_{index}"
    if core_kind == "vanilla":
      core = hk.VanillaRNN(hidden_size=hidden_size, name=core_name)
    elif core_kind == "lstm":
      core = hk.LSTM(hidden_size=hidden_size, name=core_name)
    elif core_kind == "gru":
      core = hk.GRU(hidden_size=hidden_size, name=core_name)
    else:
      raise ValueError(f"Unrecognized core_type={core_type}.")
    layers.append(core)
    # Every core except the one at the last layer index gets an activation.
    if index != num_layers - 1:
      layers.append(activation)
  if activate_final:
    layers.append(activation)
  return hk.DeepRNN(layers, name="RNN")
@@ -0,0 +1,10 @@
 git+https://github.com/deepmind/dm_hamiltonian_dynamics_suite@main#egg=dm_hamiltonian_dynamics_suite
 absl-py==0.12.0
 numpy>=1.16.4
 scikit-learn>=1.0
 typing>=3.7.4.3
 jax==0.2.20
 jaxline==0.0.3
 distrax==0.0.2
 optax==0.0.6
 dm-haiku==0.0.3
@@ -0,0 +1,54 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Setup for pip package."""
 from setuptools import setup
 # Runtime dependencies installed together with the package.
 REQUIRED_PACKAGES = (
    "dm_hamiltonian_dynamics_suite@git+https://github.com/deepmind/dm_hamiltonian_dynamics_suite",  # pylint: disable=line-too-long.
    "absl-py>=0.12.0",
    "numpy>=1.16.4",
    "scikit-learn>=1.0",
    "typing>=3.7.4.3",
    "jax==0.2.20",
    "jaxline==0.0.3",
    "distrax==0.0.2",
    "optax==0.0.6",
    "dm-haiku==0.0.3",
 )
 # The description previously ended on a dangling "publication of:"; the two
 # papers named in the project README are now listed after it.
 LONG_DESCRIPTION = "\n".join([
    "A codebase containing the implementation of the following models:",
    "Hamiltonian Generative Network (HGN)",
    "Lagrangian Generative Network (LGN)",
    "Neural ODE",
    "Recurrent Generative Network (RGN)",
    "and RNN, LSTM and GRU.",
    "This is code accompanying the publication of:",
    "SyMetric: Measuring the Quality of Learnt Hamiltonian Dynamics Inferred "
    "from Vision",
    "Which priors matter? Benchmarking models for learning latent dynamics",
 ])
 # The package directory itself is the `physics_inspired_models` package and
 # its `models` sub-directory is the only sub-package.
 setup(
    name="physics_inspired_models",
    version="0.0.1",
    description="Implementation of multiple physically inspired models.",
    long_description=LONG_DESCRIPTION,
    url="https://github.com/deepmind/deepmind-research/physics_inspired_models",
    author="DeepMind",
    package_dir={"physics_inspired_models": "."},
    packages=["physics_inspired_models", "physics_inspired_models.models"],
    install_requires=REQUIRED_PACKAGES,
    platforms=["any"],
    license="Apache License, Version 2.0",
 )
@@ -0,0 +1,274 @@
 # Copyright 2020 DeepMind Technologies Limited.
 #
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Utilities functions for Jax."""
 import collections
 import functools
 from typing import Any, Callable, Dict, Mapping, Union
 import distrax
 import jax
 from jax import core
 from jax import lax
 from jax import nn
 import jax.numpy as jnp
 from jax.tree_util import register_pytree_node
 from jaxline import utils
 import numpy as np
 # Two-level mapping of names to arrays.
 # NOTE(review): presumably module name -> parameter name -> value, as Haiku
 # lays out its parameters - confirm.
 HaikuParams = Mapping[str, Mapping[str, jnp.ndarray]]
 # Any of the parameter containers accepted in this code base: a flat mapping,
 # a Haiku-style nested mapping or a bare array.
 Params = Union[Mapping[str, jnp.ndarray], HaikuParams, jnp.ndarray]
 # Signature of an element-wise activation function.
 _Activation = Callable[[jnp.ndarray], jnp.ndarray]
 # Leaky ReLU with a negative slope of 0.2.
 # NOTE(review): the name suggests this mirrors the TensorFlow default -
 # confirm.
 tf_leaky_relu = functools.partial(nn.leaky_relu, negative_slope=0.2)
 def filter_only_scalar_stats(stats):
  """Returns a new dict with only the entries of `stats` whose size is 1."""
  scalars = {}
  for key, value in stats.items():
    if value.size == 1:
      scalars[key] = value
  return scalars
 def to_numpy(obj):
  """Converts every leaf of the pytree `obj` into a NumPy array.

  Args:
    obj: An arbitrary pytree.

  Returns:
    A pytree with the same structure whose leaves are `np.ndarray`s.
  """
  # `jax.tree_map` is a deprecated alias that newer JAX releases removed;
  # `jax.tree_util.tree_map` is the same function under its stable name.
  return jax.tree_util.tree_map(np.array, obj)
@jax.custom_gradient
 def geco_lagrange_product(lagrange_multiplier, constraint_ema, constraint_t):
  """Modifies the gradients so that they work as described in GECO.
  The evaluation gives:
    lagrange * C_ema
  The gradient w.r.t lagrange:
    - g * C_t
  The gradient w.r.t constraint_ema:
    0.0
  The gradient w.r.t constraint_t:
    g * lagrange
  Note that if you pass the same value for `constraint_ema` and `constraint_t`
  this would only flip the gradient for the lagrange multiplier.
  Args:
    lagrange_multiplier: The lagrange multiplier
    constraint_ema: The moving average of the constraint
    constraint_t: The current constraint
  Returns:
    The product `lagrange_multiplier * constraint_ema`. The function body also
    returns the `grad` closure, which `jax.custom_gradient` uses as the
    backward rule in place of the automatically derived one.
  """
  def grad(gradient):
    # One entry per positional argument, in the order
    # (lagrange_multiplier, constraint_ema, constraint_t); `gradient` is the
    # incoming gradient of the output (`g` in the docstring above).
    return (- gradient * constraint_t,
            jnp.zeros_like(constraint_ema),
            gradient * lagrange_multiplier)
  return lagrange_multiplier * constraint_ema, grad
 def bcast_if(x, t, n):
  """Repeats `x` into a list of length `n` if it is an instance of `t`.

  Args:
    x: The value to possibly broadcast.
    t: A type (or tuple of types) accepted by `isinstance`.
    n: The number of repetitions.

  Returns:
    `[x] * n` when `x` is an instance of `t`, otherwise `x` unchanged.
  """
  if isinstance(x, t):
    return [x] * n
  return x
 def stack_time_into_channels(
    images: jnp.ndarray,
    data_format: str
 ) -> jnp.ndarray:
  """Folds axis 1 of `images` into the channel axis.

  Args:
    images: An array whose axis 1 is split into individual slices; the
      function name suggests this is the time axis.
    data_format: The layout of each slice once axis 1 is removed, e.g. "NHWC".
      The position of "C" in this string is the concatenation axis.

  Returns:
    The slices along axis 1 concatenated, in order, along the channel axis.
  """
  channel_axis = data_format.index("C")
  num_steps = images.shape[1]
  frames = []
  for frame in jnp.split(images, num_steps, axis=1):
    frames.append(jnp.squeeze(frame, axis=1))
  return jnp.concatenate(frames, channel_axis)
 def stack_device_dim_into_batch(obj):
  """Merges the two leading axes of every leaf in the pytree `obj`.

  Args:
    obj: A pytree whose leaves support `reshape` and have at least two axes;
      the function name suggests these are the device and batch axes.

  Returns:
    A pytree with the same structure where each leaf of shape `(d, b, ...)`
    has been reshaped to `(d * b, ...)`.
  """
  # The top-level `jax.tree_map` alias has been deprecated and removed from
  # recent JAX releases; `jax.tree_util.tree_map` is the stable spelling.
  return jax.tree_util.tree_map(
      lambda x: x.reshape((-1,) + x.shape[2:]), obj)
 def nearest_neighbour_upsampling(x, scale, data_format="NHWC"):
  """Upsamples the two spatial axes of `x` by repeating every element.

  Args:
    x: A 4D array laid out according to `data_format`.
    scale: Integer factor by which both spatial axes are enlarged.
    data_format: Either "NHWC" or "NCHW".

  Returns:
    An array with the height and width axes multiplied by `scale`.

  Raises:
    ValueError: If `data_format` is neither "NHWC" nor "NCHW".
  """
  if data_format not in ("NCHW", "NHWC"):
    raise ValueError(f"Unrecognized data_format={data_format}.")
  if data_format == "NCHW":
    batch, channels, height, width = x.shape
    expanded_shape = [batch, channels, height, 1, width, 1]
    tile_shape = [1, 1, 1, scale, 1, scale]
    output_shape = [batch, channels, scale * height, scale * width]
  else:
    batch, height, width, channels = x.shape
    expanded_shape = [batch, height, 1, width, 1, channels]
    tile_shape = [1, 1, scale, 1, scale, 1]
    output_shape = [batch, scale * height, scale * width, channels]
  # A singleton axis is inserted after each spatial axis and then broadcast
  # against a block of ones, which repeats every element `scale` times.
  expanded = jnp.reshape(x, expanded_shape)
  tiles = jnp.ones(tile_shape, dtype=expanded.dtype)
  return jnp.reshape(expanded * tiles, output_shape)
 def get_activation(arg: Union[_Activation, str]) -> _Activation:
  """Returns an activation function given either a callable or its name.

  Args:
    arg: Either a callable, which is returned as is, or the name of an
      activation. Names are looked up, in order, in this module's globals, in
      `jax.nn` and in `jax.numpy`.

  Returns:
    The resolved activation function.

  Raises:
    ValueError: If `arg` is a name that can not be found in any of the three
      namespaces, or if the provided or resolved object is not callable.
  """
  if isinstance(arg, str):
    name = arg
    # Try fetch in order - [this module, jax.nn, jax.numpy]
    if name in globals():
      arg = globals()[name]
    elif hasattr(nn, name):
      arg = getattr(nn, name)
    elif hasattr(jnp, name):
      arg = getattr(jnp, name)
    else:
      raise ValueError(f"Unrecognized activation with name {name}.")
  # The check is applied to objects resolved by name as well: all three
  # namespaces also contain non-callables (imported modules, constants such as
  # `jnp.pi`), which must not be returned as an activation.
  if not callable(arg):
    raise ValueError(f"Expected a callable, but got {type(arg)}")
  return arg
 def merge_first_dims(x: jnp.ndarray, num_dims_to_merge: int = 2) -> jnp.ndarray:
  """Collapses the leading `num_dims_to_merge` axes of `x` into a single axis.

  Args:
    x: The array to reshape.
    num_dims_to_merge: How many leading axes are merged together.

  Returns:
    `x` reshaped to `(-1,) + x.shape[num_dims_to_merge:]`.
  """
  trailing_shape = x.shape[num_dims_to_merge:]
  merged_shape = (-1,) + trailing_shape
  return x.reshape(merged_shape)
 def extract_image(
    inputs: Union[jnp.ndarray, Mapping[str, jnp.ndarray]]
 ) -> jnp.ndarray:
  """Returns the image tensor contained in `inputs`.

  Args:
    inputs: Either a dict, from which the entry under the key `image` (or,
      when that key is absent, `x_image`) is taken, or an array.

  Returns:
    The selected dict entry, or `inputs` itself when it is already an array.

  Raises:
    NotImplementedError: If `inputs` is neither a dict nor an array.
  """
  if isinstance(inputs, dict):
    # `image` takes precedence; `x_image` is only used as a fallback.
    key = "image" if "image" in inputs else "x_image"
    return inputs[key]
  if isinstance(inputs, jnp.ndarray):
    return inputs
  raise NotImplementedError(f"Not implemented of inputs of type"
                            f" {type(inputs)}.")
 def extract_gt_state(inputs: Any) -> jnp.ndarray:
  """Returns the entry stored under `x` in `inputs`, or the array itself.

  Args:
    inputs: Either a dict, from which the entry under the key `x` is taken, or
      an array.

  Returns:
    `inputs["x"]` for a dict, otherwise `inputs` unchanged.

  Raises:
    NotImplementedError: If `inputs` is neither a dict nor an array.
  """
  if isinstance(inputs, dict):
    return inputs["x"]
  if isinstance(inputs, jnp.ndarray):
    return inputs
  raise NotImplementedError(f"Not implemented of inputs of type"
                            f" {type(inputs)}.")
 def reshape_latents_conv_to_flat(conv_latents, axis_n_to_keep=1):
  """Flattens the trailing axes of the two halves of `conv_latents`.

  The array is split in two equal halves along its last axis (named `q` and
  `p`), each half is flattened separately and the results are concatenated.

  Args:
    conv_latents: An array whose last axis has an even size.
    axis_n_to_keep: The number of leading axes which are left untouched.

  Returns:
    An array of rank `axis_n_to_keep + 1` whose last axis holds the flattened
    `q` followed by the flattened `p`.
  """
  q, p = jnp.split(conv_latents, 2, axis=-1)
  # `q` and `p` are plain arrays, so they are reshaped directly instead of
  # going through the `jax.tree_map` alias, which recent JAX releases removed.
  flat_q = q.reshape(q.shape[:axis_n_to_keep] + (-1,))
  flat_p = p.reshape(p.shape[:axis_n_to_keep] + (-1,))
  return jnp.concatenate([flat_q, flat_p], axis=-1)
 def triu_matrix_from_v(x, ndim):
  """Builds upper triangular matrices from their flattened entries.

  Args:
    x: An array whose last axis has size `ndim * (ndim + 1) // 2`; all other
      axes are treated as batch axes.
    ndim: The number of rows (and columns) of every matrix.

  Returns:
    An array of shape `x.shape[:-1] + (ndim, ndim)` whose upper triangle
    (diagonal included) is filled with the values of `x` and is zero elsewhere.
  """
  assert x.shape[-1] == (ndim * (ndim + 1)) // 2
  batch_shape = x.shape[:-1]
  zeros = jnp.zeros(batch_shape + (ndim, ndim))
  upper_indices = jnp.triu_indices(ndim)

  def scatter(target, indices, values):
    return target.at[indices].set(values)

  # One `vmap` per batch axis; the triangular indices are shared across the
  # whole batch, hence `None` for their mapped axis.
  for _ in batch_shape:
    scatter = jax.vmap(scatter, in_axes=(0, None, 0))
  return scatter(zeros, upper_indices, x)
 def flatten_dict(d, parent_key: str = "", sep: str = "_") -> Dict[str, Any]:
  """Flattens a nested mapping into a single level dict.

  Args:
    d: The (possibly nested) mapping to flatten. Its keys must be strings.
    parent_key: Prefix for every key of `d`; no prefix is added when empty.
    sep: The separator placed between the keys of successive nesting levels.

  Returns:
    A dict mapping the joined key path of every non-mapping value to that
    value. Nested mappings which are empty contribute no entries.
  """
  # `collections.MutableMapping` was removed in Python 3.10; the abstract base
  # class has to be taken from `collections.abc`. It is imported locally so
  # this function does not rely on the submodule being loaded elsewhere.
  from collections.abc import MutableMapping  # pylint: disable=g-import-not-at-top
  flat = {}
  for k, v in d.items():
    new_key = parent_key + sep + k if parent_key else k
    if isinstance(v, MutableMapping):
      flat.update(flatten_dict(v, new_key, sep=sep))
    else:
      flat[new_key] = v
  return flat
 def convert_to_pytype(target, reference):
  """Makes target the same pytype as reference.

  Args:
    target: The pytree providing the leaves.
    reference: The pytree providing the structure.

  Returns:
    The leaves of `target`, in flattening order, arranged in the tree
    structure of `reference`.
  """
  # The top-level `jax.tree_flatten` / `jax.tree_unflatten` aliases have been
  # deprecated and removed from recent JAX releases; the `jax.tree_util`
  # functions are the stable spelling.
  treedef = jax.tree_util.tree_structure(reference)
  leaves = jax.tree_util.tree_leaves(target)
  return jax.tree_util.tree_unflatten(treedef, leaves)
 def func_if_not_scalar(func):
  """Wraps `func` so that zero-dimensional inputs are returned unchanged.

  Args:
    func: A reduction with the signature `func(array, axis=...)`.

  Returns:
    A function `wrapped(array, axis=0)` which returns `array` itself when
    `array.ndim == 0` and `func(array, axis=axis)` otherwise.
  """
  @functools.wraps(func)
  def maybe_apply(array, axis=0):
    is_scalar = array.ndim == 0
    return array if is_scalar else func(array, axis=axis)
  return maybe_apply
 mean_if_not_scalar = func_if_not_scalar(jnp.mean)
 class MultiBatchAccumulator(object):
  """Class for abstracting statistics accumulation over multiple batches.

  It keeps a sample-weighted running sum of the values that were added,
  together with the elementwise maximum and minimum of those values.
  """

  def __init__(self):
    # All of these stay `None` until the first call to `add`.
    self._obj = None
    self._obj_max = None
    self._obj_min = None
    self._num_samples = None

  def add(self, averaged_values, num_samples):
    """Adds the statistics of one batch to the accumulated ones.

    Args:
      averaged_values: A pytree of per-batch averages. Every call must use the
        same tree structure.
      num_samples: The number of samples `averaged_values` was averaged over;
        it is the weight of this batch in the running sum.
    """
    # `jax.tree_multimap` and the top-level `jax.tree_map` alias have been
    # removed from recent JAX releases; `jax.tree_util.tree_map` accepts
    # several trees and covers both uses.
    tree_map = jax.tree_util.tree_map
    if self._obj is None:
      # Multiplying by 1.0 gives the max/min trackers their own float copies.
      self._obj_max = tree_map(lambda y: y * 1.0, averaged_values)
      self._obj_min = tree_map(lambda y: y * 1.0, averaged_values)
      self._obj = tree_map(lambda y: y * num_samples, averaged_values)
      self._num_samples = num_samples
    else:
      self._obj_max = tree_map(jnp.maximum, self._obj_max, averaged_values)
      self._obj_min = tree_map(jnp.minimum, self._obj_min, averaged_values)
      self._obj = tree_map(lambda x, y: x + y * num_samples, self._obj,
                           averaged_values)
      self._num_samples += num_samples

  def value(self):
    """Returns the sample-weighted average of everything added so far."""
    return jax.tree_util.tree_map(lambda x: x / self._num_samples, self._obj)

  def max(self):
    """Returns the elementwise maximum of the added values as floats."""
    return jax.tree_util.tree_map(float, self._obj_max)

  def min(self):
    """Returns the elementwise minimum of the added values as floats."""
    return jax.tree_util.tree_map(float, self._obj_min)

  def sum(self):
    """Returns the sample-weighted sum of everything added so far."""
    return self._obj
 # Registers `distrax.Normal` as a pytree node (a module-level side effect that
 # happens on import). An instance is flattened into the children
 # `[loc, scale]` with no auxiliary data and is rebuilt by passing those two
 # children positionally to the `distrax.Normal` constructor.
 register_pytree_node(
    distrax.Normal,
    lambda instance: ([instance.loc, instance.scale], None),
    lambda _, args: distrax.Normal(*args)
 )
 def inner_product(x: Any, y: Any) -> jnp.ndarray:
  """Computes the inner product of two pytrees with the same structure.

  Args:
    x: The first pytree of arrays.
    y: The second pytree; its structure must match the one of `x`.

  Returns:
    The sum over all leaves of `sum(x_leaf * y_leaf)`. For pytrees without any
    leaves this is the Python integer 0.
  """
  # `jax.tree_multimap` and the top-level `jax.tree_leaves` alias have been
  # removed from recent JAX releases; the `jax.tree_util` functions are the
  # stable spelling.
  products = jax.tree_util.tree_map(lambda x_, y_: jnp.sum(x_ * y_), x, y)
  return sum(jax.tree_util.tree_leaves(products))
 # Re-exports of the `jaxline.utils` helpers under this module's namespace.
 get_first = utils.get_first
 bcast_local_devices = utils.bcast_local_devices
 py_prefetch = utils.py_prefetch
 # `pmap`-ed random key splitting: every mapped slice of the first argument is
 # split into `num` keys, which are returned as a list. `num` (argument index
 # 1) is marked static and broadcast, so it is passed as a plain Python value
 # that is shared by all devices.
 p_split = jax.pmap(lambda x, num: list(jax.random.split(x, num)),
                   static_broadcasted_argnums=1)
 def wrap_if_pmap(p_func):
  """Wraps a collective so that it falls back to the identity on `NameError`.

  Args:
    p_func: A function with the signature `p_func(obj, axis_name)`.

  Returns:
    A function with the same signature which returns `p_func(obj, axis_name)`,
    or `obj` unchanged if a `NameError` is raised while doing so.
  """
  def p_func_if_pmap(obj, axis_name):
    try:
      # Presumably this raises `NameError` when `axis_name` is not bound, i.e.
      # when not running inside a `pmap` with that axis name - TODO confirm.
      # NOTE(review): `core.axis_frame` is a JAX-internal API; verify that it
      # exists in the JAX version in use.
      core.axis_frame(axis_name)
      # This call is inside the `try` as well, so a `NameError` raised by
      # `p_func` itself also results in `obj` being returned unchanged.
      return p_func(obj, axis_name)
    except NameError:
      return obj
  return p_func_if_pmap
 # Variants of `lax.pmean` and `lax.psum` with the fallback described above.
 pmean_if_pmap = wrap_if_pmap(lax.pmean)
 psum_if_pmap = wrap_if_pmap(lax.psum)