mirror of
https://github.com/google-deepmind/deepmind-research.git
synced 2026-05-10 05:17:46 +08:00
afcdc77239
PiperOrigin-RevId: 299101887
265 lines
7.9 KiB
Python
265 lines
7.9 KiB
Python
# Lint as: python3
|
|
# Copyright 2019 Deepmind Technologies Limited.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Data loading functionality for IODINE."""
|
|
# pylint: disable=g-multiple-import, missing-docstring, unused-import
|
|
import os.path
|
|
|
|
from iodine.modules.utils import flatten_all_but_last, ensure_3d
|
|
from multi_object_datasets import (
|
|
clevr_with_masks,
|
|
multi_dsprites,
|
|
tetrominoes,
|
|
objects_room,
|
|
)
|
|
from shapeguard import ShapeGuard
|
|
import sonnet as snt
|
|
import tensorflow.compat.v1 as tf
|
|
|
|
|
|
class IODINEDataset(snt.AbstractModule):
|
|
num_true_objects = 1
|
|
num_channels = 3
|
|
|
|
factors = {}
|
|
|
|
def __init__(
|
|
self,
|
|
path,
|
|
batch_size,
|
|
image_dim,
|
|
crop_region=None,
|
|
shuffle_buffer=1000,
|
|
max_num_objects=None,
|
|
min_num_objects=None,
|
|
grayscale=False,
|
|
name="dataset",
|
|
**kwargs,
|
|
):
|
|
super().__init__(name=name)
|
|
self.path = os.path.abspath(os.path.expanduser(path))
|
|
self.batch_size = batch_size
|
|
self.crop_region = crop_region
|
|
self.image_dim = image_dim
|
|
self.shuffle_buffer = shuffle_buffer
|
|
self.max_num_objects = max_num_objects
|
|
self.min_num_objects = min_num_objects
|
|
self.grayscale = grayscale
|
|
self.dataset = None
|
|
|
|
def _build(self, subset="train"):
|
|
dataset = self.dataset
|
|
|
|
# filter by number of objects
|
|
if self.max_num_objects is not None or self.min_num_objects is not None:
|
|
dataset = self.dataset.filter(self.filter_by_num_objects)
|
|
|
|
if subset == "train":
|
|
# normal mode returns a shuffled dataset iterator
|
|
if self.shuffle_buffer is not None:
|
|
dataset = dataset.shuffle(self.shuffle_buffer)
|
|
elif subset == "summary":
|
|
# for generating summaries and overview images
|
|
# returns a single fixed batch
|
|
dataset = dataset.take(self.batch_size)
|
|
|
|
# repeat and batch
|
|
dataset = dataset.repeat().batch(self.batch_size, drop_remainder=True)
|
|
iterator = dataset.make_one_shot_iterator()
|
|
data = iterator.get_next()
|
|
|
|
# preprocess the data to ensure correct format, scale images etc.
|
|
data = self.preprocess(data)
|
|
return data
|
|
|
|
def filter_by_num_objects(self, d):
|
|
if "visibility" not in d:
|
|
return tf.constant(True)
|
|
min_num_objects = self.max_num_objects or 0
|
|
max_num_objects = self.max_num_objects or 6
|
|
|
|
min_predicate = tf.greater_equal(
|
|
tf.reduce_sum(d["visibility"]),
|
|
tf.constant(min_num_objects - 1e-5, dtype=tf.float32),
|
|
)
|
|
max_predicate = tf.less_equal(
|
|
tf.reduce_sum(d["visibility"]),
|
|
tf.constant(max_num_objects + 1e-5, dtype=tf.float32),
|
|
)
|
|
return tf.logical_and(min_predicate, max_predicate)
|
|
|
|
def preprocess(self, data):
|
|
sg = ShapeGuard(dims={
|
|
"B": self.batch_size,
|
|
"H": self.image_dim[0],
|
|
"W": self.image_dim[1]
|
|
})
|
|
image = sg.guard(data["image"], "B, h, w, C")
|
|
mask = sg.guard(data["mask"], "B, L, h, w, 1")
|
|
|
|
# to float
|
|
image = tf.cast(image, tf.float32) / 255.0
|
|
mask = tf.cast(mask, tf.float32) / 255.0
|
|
|
|
# crop
|
|
if self.crop_region is not None:
|
|
height_slice = slice(self.crop_region[0][0], self.crop_region[0][1])
|
|
width_slice = slice(self.crop_region[1][0], self.crop_region[1][1])
|
|
image = image[:, height_slice, width_slice, :]
|
|
|
|
mask = mask[:, :, height_slice, width_slice, :]
|
|
|
|
flat_mask, unflatten = flatten_all_but_last(mask, n_dims=3)
|
|
|
|
# rescale
|
|
size = tf.constant(
|
|
self.image_dim, dtype=tf.int32, shape=[2], verify_shape=True)
|
|
image = tf.image.resize_images(
|
|
image, size, method=tf.image.ResizeMethod.BILINEAR)
|
|
mask = tf.image.resize_images(
|
|
flat_mask, size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
|
|
|
|
if self.grayscale:
|
|
image = tf.reduce_mean(image, axis=-1, keepdims=True)
|
|
|
|
output = {
|
|
"image": sg.guard(image[:, None], "B, T, H, W, C"),
|
|
"mask": sg.guard(unflatten(mask)[:, None], "B, T, L, H, W, 1"),
|
|
"factors": self.preprocess_factors(data, sg),
|
|
}
|
|
|
|
if "visibility" in data:
|
|
output["visibility"] = sg.guard(data["visibility"], "B, L")
|
|
else:
|
|
output["visibility"] = tf.ones(sg["B, L"], dtype=tf.float32)
|
|
|
|
return output
|
|
|
|
def preprocess_factors(self, data, sg):
|
|
return {
|
|
name: sg.guard(ensure_3d(data[name]), "B, L, *")
|
|
for name in self.factors
|
|
}
|
|
|
|
def get_placeholders(self, batch_size=None):
|
|
batch_size = batch_size or self.batch_size
|
|
sg = ShapeGuard(
|
|
dims={
|
|
"B": batch_size,
|
|
"H": self.image_dim[0],
|
|
"W": self.image_dim[1],
|
|
"L": self.num_true_objects,
|
|
"C": 3,
|
|
"T": 1,
|
|
})
|
|
return {
|
|
"image": tf.placeholder(dtype=tf.float32, shape=sg["B, T, H, W, C"]),
|
|
"mask": tf.placeholder(dtype=tf.float32, shape=sg["B, T, L, H, W, 1"]),
|
|
"visibility": tf.placeholder(dtype=tf.float32, shape=sg["B, L"]),
|
|
"factors": {
|
|
name:
|
|
tf.placeholder(dtype=dtype, shape=sg["B, L, {}".format(size)])
|
|
for name, (dtype, size) in self.factors
|
|
},
|
|
}
|
|
|
|
|
|
class CLEVR(IODINEDataset):
|
|
num_true_objects = 11
|
|
num_channels = 3
|
|
factors = {
|
|
"color": (tf.uint8, 1),
|
|
"shape": (tf.uint8, 1),
|
|
"size": (tf.uint8, 1),
|
|
"position": (tf.float32, 3),
|
|
"rotation": (tf.float32, 1),
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
path,
|
|
crop_region=((29, 221), (64, 256)),
|
|
image_dim=(128, 128),
|
|
name="clevr",
|
|
**kwargs,
|
|
):
|
|
super().__init__(
|
|
path=path,
|
|
crop_region=crop_region,
|
|
image_dim=image_dim,
|
|
name=name,
|
|
**kwargs)
|
|
self.dataset = clevr_with_masks.dataset(self.path)
|
|
|
|
def preprocess_factors(self, data, sg):
|
|
|
|
return {
|
|
"color": sg.guard(ensure_3d(data["color"]), "B, L, 1"),
|
|
"shape": sg.guard(ensure_3d(data["shape"]), "B, L, 1"),
|
|
"size": sg.guard(ensure_3d(data["color"]), "B, L, 1"),
|
|
"position": sg.guard(ensure_3d(data["pixel_coords"]), "B, L, 3"),
|
|
"rotation": sg.guard(ensure_3d(data["rotation"]), "B, L, 1"),
|
|
}
|
|
|
|
|
|
class MultiDSprites(IODINEDataset):
|
|
num_true_objects = 6
|
|
num_channels = 3
|
|
factors = {
|
|
"color": (tf.float32, 3),
|
|
"shape": (tf.uint8, 1),
|
|
"scale": (tf.float32, 1),
|
|
"x": (tf.float32, 1),
|
|
"y": (tf.float32, 1),
|
|
"orientation": (tf.float32, 1),
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
path,
|
|
# variant from ['binarized', 'colored_on_grayscale', 'colored_on_colored']
|
|
dataset_variant="colored_on_grayscale",
|
|
image_dim=(64, 64),
|
|
name="multi_dsprites",
|
|
**kwargs,
|
|
):
|
|
super().__init__(path=path, name=name, image_dim=image_dim, **kwargs)
|
|
self.dataset_variant = dataset_variant
|
|
self.dataset = multi_dsprites.dataset(self.path, self.dataset_variant)
|
|
|
|
|
|
class Tetrominoes(IODINEDataset):
|
|
num_true_objects = 6
|
|
num_channels = 3
|
|
factors = {
|
|
"color": (tf.uint8, 3),
|
|
"shape": (tf.uint8, 1),
|
|
"position": (tf.float32, 2),
|
|
}
|
|
|
|
def __init__(self, path, image_dim=(35, 35), name="tetrominoes", **kwargs):
|
|
super().__init__(path=path, name=name, image_dim=image_dim, **kwargs)
|
|
self.dataset = tetrominoes.dataset(self.path)
|
|
|
|
def preprocess_factors(self, data, sg):
|
|
pos_x = ensure_3d(data["x"])
|
|
pos_y = ensure_3d(data["y"])
|
|
position = tf.concat([pos_x, pos_y], axis=2)
|
|
|
|
return {
|
|
"color": sg.guard(ensure_3d(data["color"]), "B, L, 3"),
|
|
"shape": sg.guard(ensure_3d(data["shape"]), "B, L, 1"),
|
|
"position": sg.guard(ensure_3d(position), "B, L, 2"),
|
|
}
|