commit 297b5bb62de3302ec35b40ae8ccffcc9343671e2
Author: Xiang Fu <>
Date:   Tue Jun 29 21:20:44 2021 -0400

    initial commit.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f6a81cd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.py[cod]
+*.egg-info
+dist/
+MUJOCO_LOG.TXT
\ No newline at end of file
diff --git a/Dreamer/dmc2gym/__init__.py b/Dreamer/dmc2gym/__init__.py
new file mode 100644
index 0000000..727957a
--- /dev/null
+++ b/Dreamer/dmc2gym/__init__.py
@@ -0,0 +1,52 @@
+import gym
+from gym.envs.registration import register
+
+
+def make(
+    domain_name,
+    task_name,
+    resource_files,
+    img_source,
+    total_frames,
+    seed=1,
+    visualize_reward=True,
+    from_pixels=False,
+    height=84,
+    width=84,
+    camera_id=0,
+    frame_skip=1,
+    episode_length=1000,
+    environment_kwargs=None
+):
+    env_id = 'dmc_%s_%s_%s-v1' % (domain_name, task_name, seed)
+
+    if from_pixels:
+        assert not visualize_reward, 'cannot use visualize reward when learning from pixels'
+
+    # shorten episode length
+    max_episode_steps = (episode_length + frame_skip - 1) // frame_skip
+
+    if env_id not in gym.envs.registry.env_specs:
+        register(
+            id=env_id,
+            entry_point='dmc2gym.wrappers:DMCWrapper',
+            kwargs={
+                'domain_name': domain_name,
+                'task_name': task_name,
+                'resource_files': resource_files,
+                'img_source': img_source,
+                'total_frames': total_frames,
+                'task_kwargs': {
+                    'random': seed
+                },
+                'environment_kwargs': environment_kwargs,
+                'visualize_reward': visualize_reward,
+                'from_pixels': from_pixels,
+                'height': height,
+                'width': width,
+                'camera_id': camera_id,
+                'frame_skip': frame_skip,
+            },
+            # max_episode_steps=max_episode_steps
+        )
+    return gym.make(env_id)
diff --git a/Dreamer/dmc2gym/natural_imgsource.py b/Dreamer/dmc2gym/natural_imgsource.py
new file mode 100644
index 0000000..42ef62f
--- /dev/null
+++ b/Dreamer/dmc2gym/natural_imgsource.py
@@ -0,0 +1,183 @@
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import cv2
+import skvideo.io
+import random
+import tqdm
+
+class BackgroundMatting(object):
+    """
+    Produce a mask by masking the given color. This is a simple strategy
+    but effective for many games.
+    """
+    def __init__(self, color):
+        """
+        Args:
+            color: a (r, g, b) tuple or single value for grayscale
+        """
+        self._color = color
+
+    def get_mask(self, img):
+        return img == self._color
+
+
+class ImageSource(object):
+    """
+    Source of natural images to be added to a simulated environment.
+    """
+    def get_image(self):
+        """
+        Returns:
+            an RGB image of [h, w, 3] with a fixed shape.
+        """
+        pass
+
+    def reset(self):
+        """ Called when an episode ends.
""" + pass + + +class FixedColorSource(ImageSource): + def __init__(self, shape, color): + """ + Args: + shape: [h, w] + color: a 3-tuple + """ + self.arr = np.zeros((shape[0], shape[1], 3)) + self.arr[:, :] = color + + def get_image(self): + return self.arr + + +class RandomColorSource(ImageSource): + def __init__(self, shape): + """ + Args: + shape: [h, w] + """ + self.shape = shape + self.arr = None + self.reset() + + def reset(self): + self._color = np.random.randint(0, 256, size=(3,)) + self.arr = np.zeros((self.shape[0], self.shape[1], 3)) + self.arr[:, :] = self._color + + def get_image(self): + return self.arr + + +class NoiseSource(ImageSource): + def __init__(self, shape, strength=255): + """ + Args: + shape: [h, w] + strength (int): the strength of noise, in range [0, 255] + """ + self.shape = shape + self.strength = strength + + def get_image(self): + return np.random.randn(self.shape[0], self.shape[1], 3) * self.strength + + +class RandomImageSource(ImageSource): + def __init__(self, shape, filelist, total_frames=None, grayscale=False): + """ + Args: + shape: [h, w] + filelist: a list of image files + """ + self.grayscale = grayscale + self.total_frames = total_frames + self.shape = shape + self.filelist = filelist + self.build_arr() + self.current_idx = 0 + self.reset() + + def build_arr(self): + self.total_frames = self.total_frames if self.total_frames else len(self.filelist) + self.arr = np.zeros((self.total_frames, self.shape[0], self.shape[1]) + ((3,) if not self.grayscale else (1,))) + for i in range(self.total_frames): + # if i % len(self.filelist) == 0: random.shuffle(self.filelist) + fname = self.filelist[i % len(self.filelist)] + if self.grayscale: im = cv2.imread(fname, cv2.IMREAD_GRAYSCALE)[..., None] + else: im = cv2.imread(fname, cv2.IMREAD_COLOR) + self.arr[i] = cv2.resize(im, (self.shape[1], self.shape[0])) ## THIS IS NOT A BUG! cv2 uses (width, height) + + def reset(self): + self._loc = np.random.randint(0, self.total_frames) + + def get_image(self): + return self.arr[self._loc] + + +class RandomVideoSource(ImageSource): + def __init__(self, shape, filelist, total_frames=None, grayscale=False): + """ + Args: + shape: [h, w] + filelist: a list of video files + """ + self.grayscale = grayscale + self.total_frames = total_frames + self.shape = shape + self.filelist = filelist + self.build_arr() + self.current_idx = 0 + self.reset() + + def build_arr(self): + if not self.total_frames: + self.total_frames = 0 + self.arr = None + random.shuffle(self.filelist) + for fname in tqdm.tqdm(self.filelist, desc="Loading videos for natural", position=0): + if self.grayscale: frames = skvideo.io.vread(fname, outputdict={"-pix_fmt": "gray"}) + else: frames = skvideo.io.vread(fname) + local_arr = np.zeros((frames.shape[0], self.shape[0], self.shape[1]) + ((3,) if not self.grayscale else (1,))) + for i in tqdm.tqdm(range(frames.shape[0]), desc="video frames", position=1): + local_arr[i] = cv2.resize(frames[i], (self.shape[1], self.shape[0])) ## THIS IS NOT A BUG! 
cv2 uses (width, height)
+            if self.arr is None:
+                self.arr = local_arr
+            else:
+                self.arr = np.concatenate([self.arr, local_arr], 0)
+            self.total_frames += local_arr.shape[0]
+        else:
+            self.arr = np.zeros((self.total_frames, self.shape[0], self.shape[1]) + ((3,) if not self.grayscale else (1,)))
+            total_frame_i = 0
+            file_i = 0
+            with tqdm.tqdm(total=self.total_frames, desc="Loading videos for natural") as pbar:
+                while total_frame_i < self.total_frames:
+                    if file_i % len(self.filelist) == 0: random.shuffle(self.filelist)
+                    file_i += 1
+                    fname = self.filelist[file_i % len(self.filelist)]
+                    if self.grayscale: frames = skvideo.io.vread(fname, outputdict={"-pix_fmt": "gray"})
+                    else: frames = skvideo.io.vread(fname)
+                    for frame_i in range(frames.shape[0]):
+                        if total_frame_i >= self.total_frames: break
+                        if self.grayscale:
+                            self.arr[total_frame_i] = cv2.resize(frames[frame_i], (self.shape[1], self.shape[0]))[..., None] ## THIS IS NOT A BUG! cv2 uses (width, height)
+                        else:
+                            self.arr[total_frame_i] = cv2.resize(frames[frame_i], (self.shape[1], self.shape[0]))
+                        pbar.update(1)
+                        total_frame_i += 1
+
+    def reset(self):
+        self._loc = np.random.randint(0, self.total_frames)
+
+    def get_image(self):
+        img = self.arr[self._loc % self.total_frames]
+        self._loc += 1
+        return img
diff --git a/Dreamer/dmc2gym/wrappers.py b/Dreamer/dmc2gym/wrappers.py
new file mode 100644
index 0000000..7968416
--- /dev/null
+++ b/Dreamer/dmc2gym/wrappers.py
@@ -0,0 +1,201 @@
+from gym import core, spaces
+import glob
+import os
+import local_dm_control_suite as suite
+from dm_env import specs
+import numpy as np
+import skimage.io
+
+from dmc2gym import natural_imgsource
+
+
+def _spec_to_box(spec):
+    def extract_min_max(s):
+        assert s.dtype == np.float64 or s.dtype == np.float32
+        # np.int was removed in NumPy 1.24; the builtin int is the drop-in replacement.
+        dim = int(np.prod(s.shape))
+        if type(s) == specs.Array:
+            bound = np.inf * np.ones(dim, dtype=np.float32)
+            return -bound, bound
+        elif type(s) == specs.BoundedArray:
+            zeros = np.zeros(dim, dtype=np.float32)
+            return s.minimum + zeros, s.maximum + zeros
+
+    mins, maxs = [], []
+    for s in spec:
+        mn, mx = extract_min_max(s)
+        mins.append(mn)
+        maxs.append(mx)
+    low = np.concatenate(mins, axis=0)
+    high = np.concatenate(maxs, axis=0)
+    assert low.shape == high.shape
+    return spaces.Box(low, high, dtype=np.float32)
+
+
+def _flatten_obs(obs):
+    obs_pieces = []
+    for v in obs.values():
+        flat = np.array([v]) if np.isscalar(v) else v.ravel()
+        obs_pieces.append(flat)
+    return np.concatenate(obs_pieces, axis=0)
+
+
+class DMCWrapper(core.Env):
+    def __init__(
+        self,
+        domain_name,
+        task_name,
+        resource_files,
+        img_source,
+        total_frames,
+        task_kwargs=None,
+        visualize_reward=False,
+        from_pixels=False,
+        height=84,
+        width=84,
+        camera_id=0,
+        frame_skip=1,
+        environment_kwargs=None
+    ):
+        assert 'random' in task_kwargs, 'please specify a seed for deterministic behaviour'
+        self._from_pixels = from_pixels
+        self._height = height
+        self._width = width
+        self._camera_id = camera_id
+        self._frame_skip = frame_skip
+        self._img_source = img_source
+        self._resource_files = resource_files
+
+        # create task
+        self._env = suite.load(
+            domain_name=domain_name,
+            task_name=task_name,
+            task_kwargs=task_kwargs,
+            visualize_reward=visualize_reward,
+            environment_kwargs=environment_kwargs
+        )
+
+        # true and normalized action spaces
+        self._true_action_space = _spec_to_box([self._env.action_spec()])
+        self._norm_action_space = spaces.Box(
+            low=-1.0,
+            high=1.0,
+            shape=self._true_action_space.shape,
+            dtype=np.float32
+        )
+
+        # create observation
space + if from_pixels: + self._observation_space = spaces.Box( + low=0, high=255, shape=[3, height, width], dtype=np.uint8 + ) + else: + self._observation_space = _spec_to_box( + self._env.observation_spec().values() + ) + + self._internal_state_space = spaces.Box( + low=-np.inf, + high=np.inf, + shape=self._env.physics.get_state().shape, + dtype=np.float32 + ) + + # background + if img_source is not None: + shape2d = (height, width) + if img_source == "color": + self._bg_source = natural_imgsource.RandomColorSource(shape2d) + elif img_source == "noise": + self._bg_source = natural_imgsource.NoiseSource(shape2d) + else: + files = glob.glob(os.path.expanduser(resource_files)) + assert len(files), "Pattern {} does not match any files".format( + resource_files + ) + if img_source == "images": + self._bg_source = natural_imgsource.RandomImageSource(shape2d, files, grayscale=True, total_frames=total_frames) + elif img_source == "video": + self._bg_source = natural_imgsource.RandomVideoSource(shape2d, files, grayscale=True, total_frames=total_frames) + else: + raise Exception("img_source %s not defined." % img_source) + + # set seed + self.seed(seed=task_kwargs.get('random', 1)) + + def __getattr__(self, name): + return getattr(self._env, name) + + def _get_obs(self, time_step): + if self._from_pixels: + obs = self.render( + height=self._height, + width=self._width, + camera_id=self._camera_id + ) + if self._img_source is not None: + mask = np.logical_and((obs[:, :, 2] > obs[:, :, 1]), (obs[:, :, 2] > obs[:, :, 0])) # hardcoded for dmc + bg = self._bg_source.get_image() + obs[mask] = bg[mask] + # obs = obs.transpose(2, 0, 1).copy() + # CHW to HWC for tensorflow + obs = obs.copy() + else: + obs = _flatten_obs(time_step.observation) + return obs + + def _convert_action(self, action): + action = action.astype(np.float64) + true_delta = self._true_action_space.high - self._true_action_space.low + norm_delta = self._norm_action_space.high - self._norm_action_space.low + action = (action - self._norm_action_space.low) / norm_delta + action = action * true_delta + self._true_action_space.low + action = action.astype(np.float32) + return action + + @property + def observation_space(self): + return self._observation_space + + @property + def internal_state_space(self): + return self._internal_state_space + + @property + def action_space(self): + return self._norm_action_space + + def seed(self, seed): + self._true_action_space.seed(seed) + self._norm_action_space.seed(seed) + self._observation_space.seed(seed) + + def step(self, action): + assert self._norm_action_space.contains(action) + action = self._convert_action(action) + assert self._true_action_space.contains(action) + reward = 0 + extra = {'internal_state': self._env.physics.get_state().copy()} + + for _ in range(self._frame_skip): + time_step = self._env.step(action) + reward += time_step.reward or 0 + done = time_step.last() + if done: + break + obs = self._get_obs(time_step) + extra['discount'] = time_step.discount + return obs, reward, done, extra + + def reset(self): + time_step = self._env.reset() + obs = self._get_obs(time_step) + return obs + + def render(self, mode='rgb_array', height=None, width=None, camera_id=0): + assert mode == 'rgb_array', 'only support rgb_array mode, given %s' % mode + height = height or self._height + width = width or self._width + camera_id = camera_id or self._camera_id + return self._env.physics.render( + height=height, width=width, camera_id=camera_id + ) diff --git a/Dreamer/dreamers.py 
b/Dreamer/dreamers.py new file mode 100644 index 0000000..62b1b3a --- /dev/null +++ b/Dreamer/dreamers.py @@ -0,0 +1,739 @@ +import tools +import models +from tensorflow_probability import distributions as tfd +from tensorflow.keras.mixed_precision import experimental as prec +import tensorflow as tf +import numpy as np +import collections +import functools +import json +import time + +from env_tools import preprocess, count_steps + +def load_dataset(directory, config): + episode = next(tools.load_episodes(directory, 1)) + types = {k: v.dtype for k, v in episode.items()} + shapes = {k: (None,) + v.shape[1:] for k, v in episode.items()} + + def generator(): return tools.load_episodes( + directory, config.train_steps, config.batch_length, + config.dataset_balance) + dataset = tf.data.Dataset.from_generator(generator, types, shapes) + dataset = dataset.batch(config.batch_size, drop_remainder=True) + dataset = dataset.map(functools.partial(preprocess, config=config)) + dataset = dataset.prefetch(10) + return dataset + +class Dreamer(tools.Module): + + def __init__(self, config, datadir, actspace, writer): + self._c = config + self._actspace = actspace + self._actdim = actspace.n if hasattr( + actspace, 'n') else actspace.shape[0] + self._writer = writer + self._random = np.random.RandomState(config.seed) + self._should_pretrain = tools.Once() + self._should_train = tools.Every(config.train_every) + self._should_log = tools.Every(config.log_every) + self._last_log = None + self._last_time = time.time() + self._metrics = collections.defaultdict(tf.metrics.Mean) + self._metrics['expl_amount'] # Create variable for checkpoint. + self._float = prec.global_policy().compute_dtype + self._strategy = tf.distribute.MirroredStrategy() + with tf.device('cpu:0'): + self._step = tf.Variable(count_steps( + datadir, config), dtype=tf.int64) + with self._strategy.scope(): + self._dataset = iter(self._strategy.experimental_distribute_dataset( + load_dataset(datadir, self._c))) + self._build_model() + + def __call__(self, obs, reset, state=None, training=True): + step = self._step.numpy().item() + tf.summary.experimental.set_step(step) + if state is not None and reset.any(): + mask = tf.cast(1 - reset, self._float)[:, None] + state = tf.nest.map_structure(lambda x: x * mask, state) + if self._should_train(step): + log = self._should_log(step) + n = self._c.pretrain if self._should_pretrain() else self._c.train_steps + print(f'Training for {n} steps.') + with self._strategy.scope(): + for train_step in range(n): + log_images = self._c.log_images and log and train_step == 0 + self.train(next(self._dataset), log_images) + if log: + self._write_summaries() + action, state = self.policy(obs, state, training) + if training: + self._step.assign_add(len(reset) * self._c.action_repeat) + return action, state + + @tf.function + def policy(self, obs, state, training): + if state is None: + latent = self._dynamics.initial(len(obs['image'])) + action = tf.zeros((len(obs['image']), self._actdim), self._float) + else: + latent, action = state + embed = self._encode(preprocess(obs, self._c)) + latent, _ = self._dynamics.obs_step(latent, action, embed) + feat = self._dynamics.get_feat(latent) + if training: + action = self._actor(feat).sample() + else: + action = self._actor(feat).mode() + action = self._exploration(action, training) + state = (latent, action) + return action, state + + def load(self, filename): + super().load(filename) + self._should_pretrain() + + @tf.function() + def train(self, data, log_images=False): + 
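+        # `experimental_run_v2` dispatches `_train` once per replica of the
+        # MirroredStrategy created in `__init__`; TF >= 2.2 exposes the same
+        # method as `Strategy.run`.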
self._strategy.experimental_run_v2( + self._train, args=(data, log_images)) + + def _train(self, data, log_images): + with tf.GradientTape() as model_tape: + embed = self._encode(data) + post, prior = self._dynamics.observe(embed, data['action']) + feat = self._dynamics.get_feat(post) + + image_pred = self._decode(feat) + reward_pred = self._reward(feat) + + likes = tools.AttrDict() + likes.image = tf.reduce_mean(image_pred.log_prob(data['image'])) + likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) + if self._c.pcont: + pcont_pred = self._pcont(feat) + pcont_target = self._c.discount * data['discount'] + likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) + likes.pcont *= self._c.pcont_scale + + prior_dist = self._dynamics.get_dist(prior) + post_dist = self._dynamics.get_dist(post) + div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) + div = tf.maximum(div, self._c.free_nats) + + model_loss = self._c.kl_scale * div - sum(likes.values()) + model_loss /= float(self._strategy.num_replicas_in_sync) + + with tf.GradientTape() as actor_tape: + imag_feat = self._imagine_ahead(post) + reward = self._reward(imag_feat).mode() + if self._c.pcont: + pcont = self._pcont(imag_feat).mean() + else: + pcont = self._c.discount * tf.ones_like(reward) + value = self._value(imag_feat).mode() + returns = tools.lambda_return( + reward[:-1], value[:-1], pcont[:-1], + bootstrap=value[-1], lambda_=self._c.disclam, axis=0) + discount = tf.stop_gradient(tf.math.cumprod(tf.concat( + [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) + actor_loss = -tf.reduce_mean(discount * returns) + actor_loss /= float(self._strategy.num_replicas_in_sync) + + with tf.GradientTape() as value_tape: + value_pred = self._value(imag_feat)[:-1] + target = tf.stop_gradient(returns) + value_loss = - \ + tf.reduce_mean(discount * value_pred.log_prob(target)) + value_loss /= float(self._strategy.num_replicas_in_sync) + + model_norm = self._model_opt(model_tape, model_loss) + actor_norm = self._actor_opt(actor_tape, actor_loss) + value_norm = self._value_opt(value_tape, value_loss) + + if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: + if self._c.log_scalars: + self._scalar_summaries( + data, feat, prior_dist, post_dist, likes, div, + model_loss, value_loss, actor_loss, model_norm, value_norm, + actor_norm) + if tf.equal(log_images, True): + self._image_summaries(data, embed, image_pred) + + def _build_model(self): + acts = dict( + elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish, + leaky_relu=tf.nn.leaky_relu) + cnn_act = acts[self._c.cnn_act] + act = acts[self._c.dense_act] + self._encode = models.ConvEncoder( + self._c.cnn_depth, cnn_act, self._c.image_size) + self._dynamics = models.RSSM( + self._c.stoch_size, self._c.deter_size, self._c.deter_size) + self._decode = models.ConvDecoder( + self._c.cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3)) + self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act) + if self._c.pcont: + self._pcont = models.DenseDecoder( + (), 3, self._c.num_units, 'binary', act=act) + self._value = models.DenseDecoder((), 3, self._c.num_units, act=act) + self._actor = models.ActionDecoder( + self._actdim, 4, self._c.num_units, self._c.action_dist, + init_std=self._c.action_init_std, act=act) + model_modules = [self._encode, self._dynamics, + self._decode, self._reward] + if self._c.pcont: + model_modules.append(self._pcont) + Optimizer = functools.partial( + tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip, + 
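+            # tools.Adam is this repo's optimizer wrapper: it applies weight
+            # decay to variables matching `wdpattern` and clips gradients at `clip`.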
wdpattern=self._c.weight_decay_pattern) + self._model_opt = Optimizer('model', model_modules, self._c.model_lr) + self._value_opt = Optimizer('value', [self._value], self._c.value_lr) + self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr) + self.train(next(self._dataset)) + + def _exploration(self, action, training): + if training: + amount = self._c.expl_amount + if self._c.expl_decay: + amount *= 0.5 ** (tf.cast(self._step, + tf.float32) / self._c.expl_decay) + if self._c.expl_min: + amount = tf.maximum(self._c.expl_min, amount) + self._metrics['expl_amount'].update_state(amount) + elif self._c.eval_noise: + amount = self._c.eval_noise + else: + return action + if self._c.expl == 'additive_gaussian': + return tf.clip_by_value(tfd.Normal(action, amount).sample(), -1, 1) + if self._c.expl == 'completely_random': + return tf.random.uniform(action.shape, -1, 1) + if self._c.expl == 'epsilon_greedy': + indices = tfd.Categorical(0 * action).sample() + # pylint: disable=unexpected-keyword-arg, no-value-for-parameter + return tf.where( + tf.random.uniform(action.shape[:1], 0, 1) < amount, + tf.one_hot(indices, action.shape[-1], dtype=self._float), + action) + raise NotImplementedError(self._c.expl) + + def _imagine_ahead(self, post): + if self._c.pcont: # Last step could be terminal. + post = {k: v[:, :-1] for k, v in post.items()} + + def flatten(x): return tf.reshape(x, [-1] + list(x.shape[2:])) + start = {k: flatten(v) for k, v in post.items()} + + def policy(state): return self._actor( + tf.stop_gradient(self._dynamics.get_feat(state))).sample() + states = tools.static_scan( + lambda prev, _: self._dynamics.img_step(prev, policy(prev)), + tf.range(self._c.horizon), start) + imag_feat = self._dynamics.get_feat(states) + return imag_feat + + def _scalar_summaries( + self, data, feat, prior_dist, post_dist, likes, div, + model_loss, value_loss, actor_loss, model_norm, value_norm, + actor_norm): + self._metrics['model_grad_norm'].update_state(model_norm) + self._metrics['value_grad_norm'].update_state(value_norm) + self._metrics['actor_grad_norm'].update_state(actor_norm) + self._metrics['prior_ent'].update_state(prior_dist.entropy()) + self._metrics['post_ent'].update_state(post_dist.entropy()) + for name, logprob in likes.items(): + self._metrics[name + '_loss'].update_state(-logprob) + self._metrics['div'].update_state(div) + self._metrics['model_loss'].update_state(model_loss) + self._metrics['value_loss'].update_state(value_loss) + self._metrics['actor_loss'].update_state(actor_loss) + self._metrics['action_ent'].update_state(self._actor(feat).entropy()) + + def _image_summaries(self, data, embed, image_pred): + truth = data['image'][:6] + 0.5 + recon = image_pred.mode()[:6] + init, _ = self._dynamics.observe(embed[:6, :5], data['action'][:6, :5]) + init = {k: v[:, -1] for k, v in init.items()} + prior = self._dynamics.imagine(data['action'][:6, 5:], init) + openl = self._decode(self._dynamics.get_feat(prior)).mode() + model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1) + error = (model - truth + 1) / 2 + openl = tf.concat([truth, model, error], 2) + tools.graph_summary( + self._writer, tools.video_summary, self._step, 'agent/openl', openl) + + def image_summary_from_data(self, data): + truth = data['image'][:6] + 0.5 + embed = self._encode(data) + post, _ = self._dynamics.observe( + embed[:6, :5], data['action'][:6, :5]) + feat = self._dynamics.get_feat(post) + init = {k: v[:, -1] for k, v in post.items()} + recon = self._decode(feat).mode()[:6] + prior = 
self._dynamics.imagine(data['action'][:6, 5:], init) + openl = self._decode(self._dynamics.get_feat(prior)).mode() + model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1) + error = (model - truth + 1) / 2 + openl = tf.concat([truth, model, error], 2) + tools.graph_summary( + self._writer, tools.video_summary, self._step, 'agent/eval_openl', openl) + + def _write_summaries(self): + step = int(self._step.numpy()) + metrics = [(k, float(v.result())) for k, v in self._metrics.items()] + if self._last_log is not None: + duration = time.time() - self._last_time + self._last_time += duration + metrics.append(('fps', (step - self._last_log) / duration)) + self._last_log = step + [m.reset_states() for m in self._metrics.values()] + with (self._c.logdir / 'metrics.jsonl').open('a') as f: + f.write(json.dumps({'step': step, **dict(metrics)}) + '\n') + [tf.summary.scalar('agent/' + k, m) for k, m in metrics] + print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics)) + self._writer.flush() + + +class SeparationDreamer(Dreamer): + + def __init__(self, config, datadir, actspace, writer): + self._metrics_disen = collections.defaultdict(tf.metrics.Mean) + self._metrics_disen['expl_amount'] + super().__init__(config, datadir, actspace, writer) + + def _train(self, data, log_images): + with tf.GradientTape(persistent=True) as model_tape: + + # main + embed = self._encode(data) + post, prior = self._dynamics.observe(embed, data['action']) + feat = self._dynamics.get_feat(post) + + # disen + embed_disen = self._disen_encode(data) + post_disen, prior_disen = self._disen_dynamics.observe( + embed_disen, data['action']) + feat_disen = self._disen_dynamics.get_feat(post_disen) + + # disen image pred + image_pred_disen = self._disen_only_decode(feat_disen) + + # joint image pred + image_pred_joint, image_pred_joint_main, image_pred_joint_disen, mask_pred = self._joint_decode( + feat, feat_disen) + + # reward pred + reward_pred = self._reward(feat) + + # optimize disen reward predictor till optimal + for _ in range(self._c.num_reward_opt_iters): + with tf.GradientTape() as disen_reward_tape: + reward_pred_disen = self._disen_reward( + tf.stop_gradient(feat_disen)) + reward_like_disen = reward_pred_disen.log_prob( + data['reward']) + reward_loss_disen = -tf.reduce_mean(reward_like_disen) + reward_loss_disen /= float( + self._strategy.num_replicas_in_sync) + reward_disen_norm = self._disen_reward_opt( + disen_reward_tape, reward_loss_disen) + + # disen reward pred with optimal reward predictor + reward_pred_disen = self._disen_reward(feat_disen) + reward_like_disen = tf.reduce_mean( + reward_pred_disen.log_prob(data['reward'])) + + # main model loss + likes = tools.AttrDict() + likes.image = tf.reduce_mean( + image_pred_joint.log_prob(data['image'])) + likes.reward = tf.reduce_mean(reward_pred.log_prob( + data['reward'])) * self._c.reward_scale + if self._c.pcont: + pcont_pred = self._pcont(feat) + pcont_target = self._c.discount * data['discount'] + likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) + likes.pcont *= self._c.pcont_scale + + prior_dist = self._dynamics.get_dist(prior) + post_dist = self._dynamics.get_dist(post) + div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) + div = tf.maximum(div, self._c.free_nats) + + model_loss = self._c.kl_scale * div - sum(likes.values()) + model_loss /= float(self._strategy.num_replicas_in_sync) + + # disen model loss with reward negative gradient + likes_disen = tools.AttrDict() + likes_disen.image = tf.reduce_mean( + 
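+                # The joint reconstruction likelihood is shared with the main
+                # branch: both models are credited for explaining the observed
+                # image through the mask-ensemble decoder.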
image_pred_joint.log_prob(data['image'])) + likes_disen.disen_only = tf.reduce_mean( + image_pred_disen.log_prob(data['image'])) + + reward_like_disen = reward_pred_disen.log_prob(data['reward']) + reward_like_disen = tf.reduce_mean(reward_like_disen) + reward_loss_disen = -reward_like_disen + + prior_dist_disen = self._disen_dynamics.get_dist(prior_disen) + post_dist_disen = self._disen_dynamics.get_dist(post_disen) + div_disen = tf.reduce_mean(tfd.kl_divergence( + post_dist_disen, prior_dist_disen)) + div_disen = tf.maximum(div_disen, self._c.free_nats) + + model_loss_disen = div_disen * self._c.disen_kl_scale + \ + reward_like_disen * self._c.disen_neg_rew_scale - \ + likes_disen.image - likes_disen.disen_only * self._c.disen_rec_scale + model_loss_disen /= float(self._strategy.num_replicas_in_sync) + + decode_loss = model_loss_disen + model_loss + + with tf.GradientTape() as actor_tape: + imag_feat = self._imagine_ahead(post) + reward = self._reward(imag_feat).mode() + if self._c.pcont: + pcont = self._pcont(imag_feat).mean() + else: + pcont = self._c.discount * tf.ones_like(reward) + value = self._value(imag_feat).mode() + returns = tools.lambda_return( + reward[:-1], value[:-1], pcont[:-1], + bootstrap=value[-1], lambda_=self._c.disclam, axis=0) + discount = tf.stop_gradient(tf.math.cumprod(tf.concat( + [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) + actor_loss = -tf.reduce_mean(discount * returns) + actor_loss /= float(self._strategy.num_replicas_in_sync) + + with tf.GradientTape() as value_tape: + value_pred = self._value(imag_feat)[:-1] + target = tf.stop_gradient(returns) + value_loss = - \ + tf.reduce_mean(discount * value_pred.log_prob(target)) + value_loss /= float(self._strategy.num_replicas_in_sync) + + model_norm = self._model_opt(model_tape, model_loss) + model_disen_norm = self._disen_opt(model_tape, model_loss_disen) + decode_norm = self._decode_opt(model_tape, decode_loss) + actor_norm = self._actor_opt(actor_tape, actor_loss) + value_norm = self._value_opt(value_tape, value_loss) + + if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: + if self._c.log_scalars: + self._scalar_summaries( + data, feat, prior_dist, post_dist, likes, div, + model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm) + self._scalar_summaries_disen( + prior_dist_disen, post_dist_disen, likes_disen, div_disen, + model_loss_disen, reward_loss_disen, + model_disen_norm, reward_disen_norm) + if tf.equal(log_images, True): + self._image_summaries_joint( + data, embed, embed_disen, image_pred_joint, mask_pred) + self._image_summaries( + self._disen_dynamics, self._disen_decode, data, embed_disen, image_pred_joint_disen, tag='disen/openl_joint_disen') + self._image_summaries( + self._disen_dynamics, self._disen_only_decode, data, embed_disen, image_pred_disen, tag='disen_only/openl_disen_only') + self._image_summaries( + self._dynamics, self._main_decode, data, embed, image_pred_joint_main, tag='main/openl_joint_main') + + def _build_model(self): + acts = dict( + elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish, + leaky_relu=tf.nn.leaky_relu) + cnn_act = acts[self._c.cnn_act] + act = acts[self._c.dense_act] + + # Distractor dynamic model + self._disen_encode = models.ConvEncoder( + self._c.disen_cnn_depth, cnn_act, self._c.image_size) + self._disen_dynamics = models.RSSM( + self._c.disen_stoch_size, self._c.disen_deter_size, self._c.disen_deter_size) + self._disen_only_decode = models.ConvDecoder( + self._c.disen_cnn_depth, cnn_act, (self._c.image_size, 
self._c.image_size, 3)) + self._disen_reward = models.DenseDecoder( + (), 2, self._c.num_units, act=act) + + # Task dynamic model + self._encode = models.ConvEncoder( + self._c.cnn_depth, cnn_act, self._c.image_size) + self._dynamics = models.RSSM( + self._c.stoch_size, self._c.deter_size, self._c.deter_size) + self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act) + if self._c.pcont: + self._pcont = models.DenseDecoder( + (), 3, self._c.num_units, 'binary', act=act) + self._value = models.DenseDecoder((), 3, self._c.num_units, act=act) + self._actor = models.ActionDecoder( + self._actdim, 4, self._c.num_units, self._c.action_dist, + init_std=self._c.action_init_std, act=act) + + # Joint decode + self._main_decode = models.ConvDecoderMask( + self._c.cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3)) + self._disen_decode = models.ConvDecoderMask( + self._c.disen_cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3)) + self._joint_decode = models.ConvDecoderMaskEnsemble( + self._main_decode, self._disen_decode, self._c.precision + ) + + disen_modules = [self._disen_encode, + self._disen_dynamics, self._disen_only_decode] + model_modules = [self._encode, self._dynamics, self._reward] + if self._c.pcont: + model_modules.append(self._pcont) + + Optimizer = functools.partial( + tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip, + wdpattern=self._c.weight_decay_pattern) + self._model_opt = Optimizer('model', model_modules, self._c.model_lr) + self._disen_opt = Optimizer('disen', disen_modules, self._c.model_lr) + self._decode_opt = Optimizer( + 'decode', [self._joint_decode], self._c.model_lr) + self._disen_reward_opt = Optimizer( + 'disen_reward', [self._disen_reward], self._c.disen_reward_lr) + self._value_opt = Optimizer('value', [self._value], self._c.value_lr) + self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr) + self.train(next(self._dataset)) + + def _scalar_summaries_disen( + self, prior_dist_disen, post_dist_disen, likes_disen, div_disen, + model_loss_disen, reward_loss_disen, + model_disen_norm, reward_disen_norm): + self._metrics_disen['model_grad_norm'].update_state(model_disen_norm) + self._metrics_disen['reward_grad_norm'].update_state(reward_disen_norm) + self._metrics_disen['prior_ent'].update_state( + prior_dist_disen.entropy()) + self._metrics_disen['post_ent'].update_state(post_dist_disen.entropy()) + for name, logprob in likes_disen.items(): + self._metrics_disen[name + '_loss'].update_state(-logprob) + self._metrics_disen['div'].update_state(div_disen) + self._metrics_disen['model_loss'].update_state(model_loss_disen) + self._metrics_disen['reward_loss'].update_state( + reward_loss_disen) + + def _image_summaries(self, dynamics, decoder, data, embed, image_pred, tag='agent/openl'): + truth = data['image'][:6] + 0.5 + recon = image_pred.mode()[:6] + init, _ = dynamics.observe(embed[:6, :5], data['action'][:6, :5]) + init = {k: v[:, -1] for k, v in init.items()} + prior = dynamics.imagine(data['action'][:6, 5:], init) + if isinstance(decoder, models.ConvDecoderMask): + openl, _ = decoder(dynamics.get_feat(prior)) + openl = openl.mode() + else: + openl = decoder(dynamics.get_feat(prior)).mode() + model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1) + error = (model - truth + 1) / 2 + openl = tf.concat([truth, model, error], 2) + tools.graph_summary( + self._writer, tools.video_summary, self._step, tag, openl) + + def _image_summaries_joint(self, data, embed, embed_disen, image_pred_joint, mask_pred): + truth 
= data['image'][:6] + 0.5 + recon_joint = image_pred_joint.mode()[:6] + mask_pred = mask_pred[:6] + + init, _ = self._dynamics.observe( + embed[:6, :5], data['action'][:6, :5]) + init_disen, _ = self._disen_dynamics.observe( + embed_disen[:6, :5], data['action'][:6, :5]) + init = {k: v[:, -1] for k, v in init.items()} + init_disen = {k: v[:, -1] for k, v in init_disen.items()} + prior = self._dynamics.imagine( + data['action'][:6, 5:], init) + prior_disen = self._disen_dynamics.imagine( + data['action'][:6, 5:], init_disen) + + feat = self._dynamics.get_feat(prior) + feat_disen = self._disen_dynamics.get_feat(prior_disen) + openl, _, _, openl_mask = self._joint_decode(feat, feat_disen) + + openl = openl.mode() + model = tf.concat([recon_joint[:, :5] + 0.5, openl + 0.5], 1) + error = (model - truth + 1) / 2 + openl = tf.concat([truth, model, error], 2) + openl_mask = tf.concat([mask_pred[:, :5] + 0.5, openl_mask + 0.5], 1) + + tools.graph_summary( + self._writer, tools.video_summary, self._step, 'joint/openl_joint', openl) + tools.graph_summary( + self._writer, tools.video_summary, self._step, 'mask/openl_mask', openl_mask) + + def image_summary_from_data(self, data): + truth = data['image'][:6] + 0.5 + + # main + embed = self._encode(data) + post, _ = self._dynamics.observe( + embed[:6, :5], data['action'][:6, :5]) + feat = self._dynamics.get_feat(post) + init = {k: v[:, -1] for k, v in post.items()} + + # disen + embed_disen = self._disen_encode(data) + post_disen, _ = self._disen_dynamics.observe( + embed_disen[:6, :5], data['action'][:6, :5]) + feat_disen = self._disen_dynamics.get_feat(post_disen) + init_disen = {k: v[:, -1] for k, v in post_disen.items()} + + # joint image pred + recon_joint, recon_main, recon_disen, recon_mask = self._joint_decode( + feat, feat_disen) + recon_joint = recon_joint.mode()[:6] + recon_main = recon_main.mode()[:6] + recon_disen = recon_disen.mode()[:6] + recon_mask = recon_mask[:6] + + prior = self._dynamics.imagine( + data['action'][:6, 5:], init) + prior_disen = self._disen_dynamics.imagine( + data['action'][:6, 5:], init_disen) + feat = self._dynamics.get_feat(prior) + feat_disen = self._disen_dynamics.get_feat(prior_disen) + + openl_joint, openl_main, openl_disen, openl_mask = self._joint_decode( + feat, feat_disen) + openl_joint = openl_joint.mode() + openl_main = openl_main.mode() + openl_disen = openl_disen.mode() + + model_joint = tf.concat( + [recon_joint[:, :5] + 0.5, openl_joint + 0.5], 1) + error_joint = (model_joint - truth + 1) / 2 + model_main = tf.concat( + [recon_main[:, :5] + 0.5, openl_main + 0.5], 1) + model_disen = tf.concat( + [recon_disen[:, :5] + 0.5, openl_disen + 0.5], 1) + model_mask = tf.concat( + [recon_mask[:, :5] + 0.5, openl_mask + 0.5], 1) + + output_joint = tf.concat( + [truth, model_main, model_disen, model_joint, error_joint], 2) + output_mask = model_mask + + tools.graph_summary( + self._writer, tools.video_summary, self._step, 'summary/openl', output_joint) + tools.graph_summary( + self._writer, tools.video_summary, self._step, 'summary/openl_mask', output_mask) + + def _write_summaries(self): + step = int(self._step.numpy()) + metrics = [(k, float(v.result())) for k, v in self._metrics.items()] + metrics_disen = [(k, float(v.result())) + for k, v in self._metrics_disen.items()] + if self._last_log is not None: + duration = time.time() - self._last_time + self._last_time += duration + metrics.append(('fps', (step - self._last_log) / duration)) + self._last_log = step + [m.reset_states() for m in 
self._metrics.values()] + [m.reset_states() for m in self._metrics_disen.values()] + with (self._c.logdir / 'metrics.jsonl').open('a') as f: + f.write(json.dumps({'step': step, **dict(metrics)}) + '\n') + [tf.summary.scalar('agent/' + k, m) for k, m in metrics] + [tf.summary.scalar('disen/' + k, m) for k, m in metrics_disen] + print('#'*30 + ' Main ' + '#'*30) + print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics)) + print('#'*30 + ' Disen ' + '#'*30) + print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics_disen)) + self._writer.flush() + +class InverseDreamer(Dreamer): + + def __init__(self, config, datadir, actspace, writer): + super().__init__(config, datadir, actspace, writer) + + def _train(self, data, log_images): + with tf.GradientTape() as model_tape: + embed = self._encode(data) + post, prior = self._dynamics.observe(embed, data['action']) + feat = self._dynamics.get_feat(post) + + action_pred = self._decode(feat) + reward_pred = self._reward(feat) + + likes = tools.AttrDict() + likes.action = tf.reduce_mean( + action_pred.log_prob(data['action'][:, :-1])) + likes.reward = tf.reduce_mean( + reward_pred.log_prob(data['reward'])) + if self._c.pcont: + pcont_pred = self._pcont(feat) + pcont_target = self._c.discount * data['discount'] + likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) + likes.pcont *= self._c.pcont_scale + + prior_dist = self._dynamics.get_dist(prior) + post_dist = self._dynamics.get_dist(post) + div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) + div = tf.maximum(div, self._c.free_nats) + + model_loss = self._c.kl_scale * div - sum(likes.values()) + model_loss /= float(self._strategy.num_replicas_in_sync) + + with tf.GradientTape() as actor_tape: + imag_feat = self._imagine_ahead(post) + reward = self._reward(imag_feat).mode() + if self._c.pcont: + pcont = self._pcont(imag_feat).mean() + else: + pcont = self._c.discount * tf.ones_like(reward) + value = self._value(imag_feat).mode() + returns = tools.lambda_return( + reward[:-1], value[:-1], pcont[:-1], + bootstrap=value[-1], lambda_=self._c.disclam, axis=0) + discount = tf.stop_gradient(tf.math.cumprod(tf.concat( + [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) + actor_loss = -tf.reduce_mean(discount * returns) + actor_loss /= float(self._strategy.num_replicas_in_sync) + + with tf.GradientTape() as value_tape: + value_pred = self._value(imag_feat)[:-1] + target = tf.stop_gradient(returns) + value_loss = - \ + tf.reduce_mean(discount * value_pred.log_prob(target)) + value_loss /= float(self._strategy.num_replicas_in_sync) + + model_norm = self._model_opt(model_tape, model_loss) + actor_norm = self._actor_opt(actor_tape, actor_loss) + value_norm = self._value_opt(value_tape, value_loss) + + if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: + if self._c.log_scalars: + self._scalar_summaries( + data, feat, prior_dist, post_dist, likes, div, + model_loss, value_loss, actor_loss, model_norm, value_norm, + actor_norm) + + def _build_model(self): + acts = dict( + elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish, + leaky_relu=tf.nn.leaky_relu) + cnn_act = acts[self._c.cnn_act] + act = acts[self._c.dense_act] + self._encode = models.ConvEncoder( + self._c.cnn_depth, cnn_act, self._c.image_size) + self._dynamics = models.RSSM( + self._c.stoch_size, self._c.deter_size, self._c.deter_size) + self._decode = models.InverseDecoder( + self._actdim, 4, self._c.num_units, act=act) + self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act) + if 
self._c.pcont: + self._pcont = models.DenseDecoder( + (), 3, self._c.num_units, 'binary', act=act) + self._value = models.DenseDecoder((), 3, self._c.num_units, act=act) + self._actor = models.ActionDecoder( + self._actdim, 4, self._c.num_units, self._c.action_dist, + init_std=self._c.action_init_std, act=act) + model_modules = [self._encode, self._dynamics, + self._decode, self._reward] + if self._c.pcont: + model_modules.append(self._pcont) + Optimizer = functools.partial( + tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip, + wdpattern=self._c.weight_decay_pattern) + self._model_opt = Optimizer('model', model_modules, self._c.model_lr) + self._value_opt = Optimizer('value', [self._value], self._c.value_lr) + self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr) + self.train(next(self._dataset)) \ No newline at end of file diff --git a/Dreamer/env_tools.py b/Dreamer/env_tools.py new file mode 100644 index 0000000..dd1d551 --- /dev/null +++ b/Dreamer/env_tools.py @@ -0,0 +1,84 @@ +import os +import json +import dmc2gym +import tensorflow as tf +from tensorflow.keras.mixed_precision import experimental as prec + +import tools +import wrappers + +def preprocess(obs, config): + dtype = prec.global_policy().compute_dtype + obs = obs.copy() + with tf.device('cpu:0'): + obs['image'] = tf.cast(obs['image'], dtype) / 255.0 - 0.5 + clip_rewards = dict(none=lambda x: x, tanh=tf.tanh)[ + config.clip_rewards] + obs['reward'] = clip_rewards(obs['reward']) + return obs + +def count_steps(datadir, config): + return tools.count_episodes(datadir)[1] * config.action_repeat + +def summarize_episode(episode, config, datadir, writer, prefix): + episodes, steps = tools.count_episodes(datadir) + length = (len(episode['reward']) - 1) * config.action_repeat + ret = episode['reward'].sum() + print(f'{prefix.title()} episode of length {length} with return {ret:.1f}.') + metrics = [ + (f'{prefix}/return', float(episode['reward'].sum())), + (f'{prefix}/length', len(episode['reward']) - 1), + ('episodes', episodes)] + + step = count_steps(datadir, config) + with (config.logdir / 'metrics.jsonl').open('a') as f: + f.write(json.dumps(dict([('step', step)] + metrics)) + '\n') + with writer.as_default(): # Env might run in a different thread. 
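+        # Making this writer the default routes the episode summaries below
+        # into the same TensorBoard log as training, even when this callback
+        # fires from the environment's thread.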
+ tf.summary.experimental.set_step(step) + [tf.summary.scalar('sim/' + k, v) for k, v in metrics] + if prefix == 'test': + tools.video_summary(f'sim/{prefix}/video', episode['image'][None]) + +def make_env(config, writer, prefix, datadir, video_dir, store): + suite, domain_task_distractor = config.task.split('_', 1) + domain, task_distractor = domain_task_distractor.split('_', 1) + task, distractor = task_distractor.split('_', 1) + + if distractor == 'driving': + img_source = 'video' + total_frames = 1000 + resource_files = os.path.join(video_dir, '*.mp4') + elif distractor == 'noise': + img_source = 'noise' + total_frames = None + resource_files = None + elif distractor == 'none': + img_source = None + total_frames = None + resource_files = None + else: + raise NotImplementedError + + env = dmc2gym.make( + domain_name=domain, + task_name=task, + resource_files=resource_files, + img_source=img_source, + total_frames=total_frames, + seed=config.seed, + visualize_reward=False, + from_pixels=True, + height=config.image_size, + width=config.image_size, + frame_skip=config.action_repeat + ) + env = wrappers.DMC2GYMWrapper(env) + env = wrappers.TimeLimit(env, config.time_limit / config.action_repeat) + callbacks = [] + if store: + callbacks.append(lambda ep: tools.save_episodes(datadir, [ep])) + callbacks.append( + lambda ep: summarize_episode(ep, config, datadir, writer, prefix)) + env = wrappers.Collect(env, callbacks, config.precision) + env = wrappers.RewardObs(env) + return env diff --git a/Dreamer/local_dm_control_suite/README.md b/Dreamer/local_dm_control_suite/README.md new file mode 100755 index 0000000..135ab42 --- /dev/null +++ b/Dreamer/local_dm_control_suite/README.md @@ -0,0 +1,56 @@ +# DeepMind Control Suite. + +This submodule contains the domains and tasks described in the +[DeepMind Control Suite tech report](https://arxiv.org/abs/1801.00690). + +## Quickstart + +```python +from dm_control import suite +import numpy as np + +# Load one task: +env = suite.load(domain_name="cartpole", task_name="swingup") + +# Iterate over a task set: +for domain_name, task_name in suite.BENCHMARKING: + env = suite.load(domain_name, task_name) + +# Step through an episode and print out reward, discount and observation. +action_spec = env.action_spec() +time_step = env.reset() +while not time_step.last(): + action = np.random.uniform(action_spec.minimum, + action_spec.maximum, + size=action_spec.shape) + time_step = env.step(action) + print(time_step.reward, time_step.discount, time_step.observation) +``` + +## Illustration video + +Below is a video montage of solved Control Suite tasks, with reward +visualisation enabled. + +[![Video montage](https://img.youtube.com/vi/rAai4QzcYbs/0.jpg)](https://www.youtube.com/watch?v=rAai4QzcYbs) + + +### Quadruped domain [April 2019] + +Roughly based on the 'ant' model introduced by [Schulman et al. 2015](https://arxiv.org/abs/1506.02438). Main modifications to the body are: + +- 4 DoFs per leg, 1 constraining tendon. +- 3 actuators per leg: 'yaw', 'lift', 'extend'. +- Filtered position actuators with timescale of 100ms. +- Sensors include an IMU, force/torque sensors, and rangefinders. + +Four tasks: + +- `walk` and `run`: self-right the body then move forward at a desired speed. +- `escape`: escape a bowl-shaped random terrain (uses rangefinders). +- `fetch`, go to a moving ball and bring it to a target. + +All behaviors in the video below were trained with [Abdolmaleki et al's +MPO](https://arxiv.org/abs/1806.06920). 
+ +[![Video montage](https://img.youtube.com/vi/RhRLjbb7pBE/0.jpg)](https://www.youtube.com/watch?v=RhRLjbb7pBE) diff --git a/Dreamer/local_dm_control_suite/__init__.py b/Dreamer/local_dm_control_suite/__init__.py new file mode 100755 index 0000000..c4d7cb9 --- /dev/null +++ b/Dreamer/local_dm_control_suite/__init__.py @@ -0,0 +1,151 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""A collection of MuJoCo-based Reinforcement Learning environments.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import inspect +import itertools + +from dm_control.rl import control + +from local_dm_control_suite import acrobot +from local_dm_control_suite import ball_in_cup +from local_dm_control_suite import cartpole +from local_dm_control_suite import cheetah +from local_dm_control_suite import finger +from local_dm_control_suite import fish +from local_dm_control_suite import hopper +from local_dm_control_suite import humanoid +from local_dm_control_suite import humanoid_CMU +from local_dm_control_suite import lqr +from local_dm_control_suite import manipulator +from local_dm_control_suite import pendulum +from local_dm_control_suite import point_mass +from local_dm_control_suite import quadruped +from local_dm_control_suite import reacher +from local_dm_control_suite import stacker +from local_dm_control_suite import swimmer +from local_dm_control_suite import walker + +# Find all domains imported. +_DOMAINS = {name: module for name, module in locals().items() + if inspect.ismodule(module) and hasattr(module, 'SUITE')} + + +def _get_tasks(tag): + """Returns a sequence of (domain name, task name) pairs for the given tag.""" + result = [] + + for domain_name in sorted(_DOMAINS.keys()): + domain = _DOMAINS[domain_name] + + if tag is None: + tasks_in_domain = domain.SUITE + else: + tasks_in_domain = domain.SUITE.tagged(tag) + + for task_name in tasks_in_domain.keys(): + result.append((domain_name, task_name)) + + return tuple(result) + + +def _get_tasks_by_domain(tasks): + """Returns a dict mapping from task name to a tuple of domain names.""" + result = collections.defaultdict(list) + + for domain_name, task_name in tasks: + result[domain_name].append(task_name) + + return {k: tuple(v) for k, v in result.items()} + + +# A sequence containing all (domain name, task name) pairs. +ALL_TASKS = _get_tasks(tag=None) + +# Subsets of ALL_TASKS, generated via the tag mechanism. +BENCHMARKING = _get_tasks('benchmarking') +EASY = _get_tasks('easy') +HARD = _get_tasks('hard') +EXTRA = tuple(sorted(set(ALL_TASKS) - set(BENCHMARKING))) + +# A mapping from each domain name to a sequence of its task names. 
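+# e.g. TASKS_BY_DOMAIN['cartpole'] would contain 'balance', 'swingup', and the
+# other cartpole tasks (illustrative; the exact tuple depends on the modules
+# imported above).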
+TASKS_BY_DOMAIN = _get_tasks_by_domain(ALL_TASKS) + + +def load(domain_name, task_name, task_kwargs=None, environment_kwargs=None, + visualize_reward=False): + """Returns an environment from a domain name, task name and optional settings. + + ```python + env = suite.load('cartpole', 'balance') + ``` + + Args: + domain_name: A string containing the name of a domain. + task_name: A string containing the name of a task. + task_kwargs: Optional `dict` of keyword arguments for the task. + environment_kwargs: Optional `dict` specifying keyword arguments for the + environment. + visualize_reward: Optional `bool`. If `True`, object colours in rendered + frames are set to indicate the reward at each step. Default `False`. + + Returns: + The requested environment. + """ + return build_environment(domain_name, task_name, task_kwargs, + environment_kwargs, visualize_reward) + + +def build_environment(domain_name, task_name, task_kwargs=None, + environment_kwargs=None, visualize_reward=False): + """Returns an environment from the suite given a domain name and a task name. + + Args: + domain_name: A string containing the name of a domain. + task_name: A string containing the name of a task. + task_kwargs: Optional `dict` specifying keyword arguments for the task. + environment_kwargs: Optional `dict` specifying keyword arguments for the + environment. + visualize_reward: Optional `bool`. If `True`, object colours in rendered + frames are set to indicate the reward at each step. Default `False`. + + Raises: + ValueError: If the domain or task doesn't exist. + + Returns: + An instance of the requested environment. + """ + if domain_name not in _DOMAINS: + raise ValueError('Domain {!r} does not exist.'.format(domain_name)) + + domain = _DOMAINS[domain_name] + + if task_name not in domain.SUITE: + raise ValueError('Level {!r} does not exist in domain {!r}.'.format( + task_name, domain_name)) + + task_kwargs = task_kwargs or {} + if environment_kwargs is not None: + task_kwargs = task_kwargs.copy() + task_kwargs['environment_kwargs'] = environment_kwargs + env = domain.SUITE[task_name](**task_kwargs) + env.task.visualize_reward = visualize_reward + return env diff --git a/Dreamer/local_dm_control_suite/acrobot.py b/Dreamer/local_dm_control_suite/acrobot.py new file mode 100755 index 0000000..a12b892 --- /dev/null +++ b/Dreamer/local_dm_control_suite/acrobot.py @@ -0,0 +1,127 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Acrobot domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers +from dm_control.utils import rewards +import numpy as np + +_DEFAULT_TIME_LIMIT = 10 +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('acrobot.xml'), common.ASSETS + + +@SUITE.add('benchmarking') +def swingup(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns Acrobot balance task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Balance(sparse=False, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, **environment_kwargs) + + +@SUITE.add('benchmarking') +def swingup_sparse(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns Acrobot sparse balance.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Balance(sparse=True, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Acrobot domain.""" + + def horizontal(self): + """Returns horizontal (x) component of body frame z-axes.""" + return self.named.data.xmat[['upper_arm', 'lower_arm'], 'xz'] + + def vertical(self): + """Returns vertical (z) component of body frame z-axes.""" + return self.named.data.xmat[['upper_arm', 'lower_arm'], 'zz'] + + def to_target(self): + """Returns the distance from the tip to the target.""" + tip_to_target = (self.named.data.site_xpos['target'] - + self.named.data.site_xpos['tip']) + return np.linalg.norm(tip_to_target) + + def orientations(self): + """Returns the sines and cosines of the pole angles.""" + return np.concatenate((self.horizontal(), self.vertical())) + + +class Balance(base.Task): + """An Acrobot `Task` to swing up and balance the pole.""" + + def __init__(self, sparse, random=None): + """Initializes an instance of `Balance`. + + Args: + sparse: A `bool` specifying whether to use a sparse (indicator) reward. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._sparse = sparse + super(Balance, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Shoulder and elbow are set to a random position between [-pi, pi). + + Args: + physics: An instance of `Physics`. 
+ """ + physics.named.data.qpos[ + ['shoulder', 'elbow']] = self.random.uniform(-np.pi, np.pi, 2) + super(Balance, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of pole orientation and angular velocities.""" + obs = collections.OrderedDict() + obs['orientations'] = physics.orientations() + obs['velocity'] = physics.velocity() + return obs + + def _get_reward(self, physics, sparse): + target_radius = physics.named.model.site_size['target', 0] + return rewards.tolerance(physics.to_target(), + bounds=(0, target_radius), + margin=0 if sparse else 1) + + def get_reward(self, physics): + """Returns a sparse or a smooth reward, as specified in the constructor.""" + return self._get_reward(physics, sparse=self._sparse) diff --git a/Dreamer/local_dm_control_suite/acrobot.xml b/Dreamer/local_dm_control_suite/acrobot.xml new file mode 100755 index 0000000..79b76d9 --- /dev/null +++ b/Dreamer/local_dm_control_suite/acrobot.xml @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/ball_in_cup.py b/Dreamer/local_dm_control_suite/ball_in_cup.py new file mode 100755 index 0000000..ac3e47f --- /dev/null +++ b/Dreamer/local_dm_control_suite/ball_in_cup.py @@ -0,0 +1,100 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Ball-in-Cup Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers + +_DEFAULT_TIME_LIMIT = 20 # (seconds) +_CONTROL_TIMESTEP = .02 # (seconds) + + +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('ball_in_cup.xml'), common.ASSETS + + +@SUITE.add('benchmarking', 'easy') +def catch(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Ball-in-Cup task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = BallInCup(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics with additional features for the Ball-in-Cup domain.""" + + def ball_to_target(self): + """Returns the vector from the ball to the target.""" + target = self.named.data.site_xpos['target', ['x', 'z']] + ball = self.named.data.xpos['ball', ['x', 'z']] + return target - ball + + def in_target(self): + """Returns 1 if the ball is in the target, 0 otherwise.""" + ball_to_target = abs(self.ball_to_target()) + target_size = self.named.model.site_size['target', [0, 2]] + ball_size = self.named.model.geom_size['ball', 0] + return float(all(ball_to_target < target_size - ball_size)) + + +class BallInCup(base.Task): + """The Ball-in-Cup task. Put the ball in the cup.""" + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Args: + physics: An instance of `Physics`. + + """ + # Find a collision-free random initial position of the ball. + penetrating = True + while penetrating: + # Assign a random ball position. + physics.named.data.qpos['ball_x'] = self.random.uniform(-.2, .2) + physics.named.data.qpos['ball_z'] = self.random.uniform(.2, .5) + # Check for collisions. + physics.after_reset() + penetrating = physics.data.ncon > 0 + super(BallInCup, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of the state.""" + obs = collections.OrderedDict() + obs['position'] = physics.position() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a sparse reward.""" + return physics.in_target() diff --git a/Dreamer/local_dm_control_suite/ball_in_cup.xml b/Dreamer/local_dm_control_suite/ball_in_cup.xml new file mode 100755 index 0000000..792073f --- /dev/null +++ b/Dreamer/local_dm_control_suite/ball_in_cup.xml @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/base.py b/Dreamer/local_dm_control_suite/base.py new file mode 100755 index 0000000..fd78318 --- /dev/null +++ b/Dreamer/local_dm_control_suite/base.py @@ -0,0 +1,112 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Base class for tasks in the Control Suite.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from dm_control import mujoco +from dm_control.rl import control + +import numpy as np + + +class Task(control.Task): + """Base class for tasks in the Control Suite. + + Actions are mapped directly to the states of MuJoCo actuators: each element of + the action array is used to set the control input for a single actuator. The + ordering of the actuators is the same as in the corresponding MJCF XML file. + + Attributes: + random: A `numpy.random.RandomState` instance. This should be used to + generate all random variables associated with the task, such as random + starting states, observation noise* etc. + + *If sensor noise is enabled in the MuJoCo model then this will be generated + using MuJoCo's internal RNG, which has its own independent state. + """ + + def __init__(self, random=None): + """Initializes a new continuous control task. + + Args: + random: Optional, either a `numpy.random.RandomState` instance, an integer + seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + if not isinstance(random, np.random.RandomState): + random = np.random.RandomState(random) + self._random = random + self._visualize_reward = False + + @property + def random(self): + """Task-specific `numpy.random.RandomState` instance.""" + return self._random + + def action_spec(self, physics): + """Returns a `BoundedArraySpec` matching the `physics` actuators.""" + return mujoco.action_spec(physics) + + def initialize_episode(self, physics): + """Resets geom colors to their defaults after starting a new episode. + + Subclasses of `base.Task` must delegate to this method after performing + their own initialization. + + Args: + physics: An instance of `mujoco.Physics`. + """ + self.after_step(physics) + + def before_step(self, action, physics): + """Sets the control signal for the actuators to values in `action`.""" + # Support legacy internal code. 
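+    # Some legacy callers wrap the action in an object whose
+    # `continuous_actions` attribute holds the array; `getattr` with a
+    # default unwraps it, and plain arrays fall through unchanged.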
+ action = getattr(action, "continuous_actions", action) + physics.set_control(action) + + def after_step(self, physics): + """Modifies colors according to the reward.""" + if self._visualize_reward: + reward = np.clip(self.get_reward(physics), 0.0, 1.0) + _set_reward_colors(physics, reward) + + @property + def visualize_reward(self): + return self._visualize_reward + + @visualize_reward.setter + def visualize_reward(self, value): + if not isinstance(value, bool): + raise ValueError("Expected a boolean, got {}.".format(type(value))) + self._visualize_reward = value + + +_MATERIALS = ["self", "effector", "target"] +_DEFAULT = [name + "_default" for name in _MATERIALS] +_HIGHLIGHT = [name + "_highlight" for name in _MATERIALS] + + +def _set_reward_colors(physics, reward): + """Sets the highlight, effector and target colors according to the reward.""" + assert 0.0 <= reward <= 1.0 + colors = physics.named.model.mat_rgba + default = colors[_DEFAULT] + highlight = colors[_HIGHLIGHT] + blend_coef = reward ** 4 # Better color distinction near high rewards. + colors[_MATERIALS] = blend_coef * highlight + (1.0 - blend_coef) * default diff --git a/Dreamer/local_dm_control_suite/cartpole.py b/Dreamer/local_dm_control_suite/cartpole.py new file mode 100755 index 0000000..b8fec14 --- /dev/null +++ b/Dreamer/local_dm_control_suite/cartpole.py @@ -0,0 +1,230 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""Cartpole domain."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from dm_control import mujoco
+from dm_control.rl import control
+from local_dm_control_suite import base
+from local_dm_control_suite import common
+from dm_control.utils import containers
+from dm_control.utils import rewards
+from lxml import etree
+import numpy as np
+from six.moves import range
+
+
+_DEFAULT_TIME_LIMIT = 10
+SUITE = containers.TaggedTasks()
+
+
+def get_model_and_assets(num_poles=1):
+  """Returns a tuple containing the model XML string and a dict of assets."""
+  return _make_model(num_poles), common.ASSETS
+
+
+@SUITE.add('benchmarking')
+def balance(time_limit=_DEFAULT_TIME_LIMIT, random=None,
+            environment_kwargs=None):
+  """Returns the Cartpole Balance task."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Balance(swing_up=False, sparse=False, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+@SUITE.add('benchmarking')
+def balance_sparse(time_limit=_DEFAULT_TIME_LIMIT, random=None,
+                   environment_kwargs=None):
+  """Returns the sparse reward variant of the Cartpole Balance task."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Balance(swing_up=False, sparse=True, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+@SUITE.add('benchmarking')
+def swingup(time_limit=_DEFAULT_TIME_LIMIT, random=None,
+            environment_kwargs=None):
+  """Returns the Cartpole Swing-Up task."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Balance(swing_up=True, sparse=False, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+@SUITE.add('benchmarking')
+def swingup_sparse(time_limit=_DEFAULT_TIME_LIMIT, random=None,
+                   environment_kwargs=None):
+  """Returns the sparse reward variant of the Cartpole Swing-Up task."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Balance(swing_up=True, sparse=True, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+@SUITE.add()
+def two_poles(time_limit=_DEFAULT_TIME_LIMIT, random=None,
+              environment_kwargs=None):
+  """Returns the Cartpole Balance task with two poles."""
+  physics = Physics.from_xml_string(*get_model_and_assets(num_poles=2))
+  task = Balance(swing_up=True, sparse=False, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+@SUITE.add()
+def three_poles(time_limit=_DEFAULT_TIME_LIMIT, random=None, num_poles=3,
+                sparse=False, environment_kwargs=None):
+  """Returns the Cartpole Balance task with three or more poles."""
+  physics = Physics.from_xml_string(*get_model_and_assets(num_poles=num_poles))
+  task = Balance(swing_up=True, sparse=sparse, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+def _make_model(n_poles):
+  """Generates an xml string defining a cart with
`n_poles` bodies.""" + xml_string = common.read_model('cartpole.xml') + if n_poles == 1: + return xml_string + mjcf = etree.fromstring(xml_string) + parent = mjcf.find('./worldbody/body/body') # Find first pole. + # Make chain of poles. + for pole_index in range(2, n_poles+1): + child = etree.Element('body', name='pole_{}'.format(pole_index), + pos='0 0 1', childclass='pole') + etree.SubElement(child, 'joint', name='hinge_{}'.format(pole_index)) + etree.SubElement(child, 'geom', name='pole_{}'.format(pole_index)) + parent.append(child) + parent = child + # Move plane down. + floor = mjcf.find('./worldbody/geom') + floor.set('pos', '0 0 {}'.format(1 - n_poles - .05)) + # Move cameras back. + cameras = mjcf.findall('./worldbody/camera') + cameras[0].set('pos', '0 {} 1'.format(-1 - 2*n_poles)) + cameras[1].set('pos', '0 {} 2'.format(-2*n_poles)) + return etree.tostring(mjcf, pretty_print=True) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Cartpole domain.""" + + def cart_position(self): + """Returns the position of the cart.""" + return self.named.data.qpos['slider'][0] + + def angular_vel(self): + """Returns the angular velocity of the pole.""" + return self.data.qvel[1:] + + def pole_angle_cosine(self): + """Returns the cosine of the pole angle.""" + return self.named.data.xmat[2:, 'zz'] + + def bounded_position(self): + """Returns the state, with pole angle split into sin/cos.""" + return np.hstack((self.cart_position(), + self.named.data.xmat[2:, ['zz', 'xz']].ravel())) + + +class Balance(base.Task): + """A Cartpole `Task` to balance the pole. + + State is initialized either close to the target configuration or at a random + configuration. + """ + _CART_RANGE = (-.25, .25) + _ANGLE_COSINE_RANGE = (.995, 1) + + def __init__(self, swing_up, sparse, random=None): + """Initializes an instance of `Balance`. + + Args: + swing_up: A `bool`, which if `True` sets the cart to the middle of the + slider and the pole pointing towards the ground. Otherwise, sets the + cart to a random position on the slider and the pole to a random + near-vertical position. + sparse: A `bool`, whether to return a sparse or a smooth reward. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._sparse = sparse + self._swing_up = swing_up + super(Balance, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Initializes the cart and pole according to `swing_up`, and in both cases + adds a small random initial velocity to break symmetry. + + Args: + physics: An instance of `Physics`. 
+ """ + nv = physics.model.nv + if self._swing_up: + physics.named.data.qpos['slider'] = .01*self.random.randn() + physics.named.data.qpos['hinge_1'] = np.pi + .01*self.random.randn() + physics.named.data.qpos[2:] = .1*self.random.randn(nv - 2) + else: + physics.named.data.qpos['slider'] = self.random.uniform(-.1, .1) + physics.named.data.qpos[1:] = self.random.uniform(-.034, .034, nv - 1) + physics.named.data.qvel[:] = 0.01 * self.random.randn(physics.model.nv) + super(Balance, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of the (bounded) physics state.""" + obs = collections.OrderedDict() + obs['position'] = physics.bounded_position() + obs['velocity'] = physics.velocity() + return obs + + def _get_reward(self, physics, sparse): + if sparse: + cart_in_bounds = rewards.tolerance(physics.cart_position(), + self._CART_RANGE) + angle_in_bounds = rewards.tolerance(physics.pole_angle_cosine(), + self._ANGLE_COSINE_RANGE).prod() + return cart_in_bounds * angle_in_bounds + else: + upright = (physics.pole_angle_cosine() + 1) / 2 + centered = rewards.tolerance(physics.cart_position(), margin=2) + centered = (1 + centered) / 2 + small_control = rewards.tolerance(physics.control(), margin=1, + value_at_margin=0, + sigmoid='quadratic')[0] + small_control = (4 + small_control) / 5 + small_velocity = rewards.tolerance(physics.angular_vel(), margin=5).min() + small_velocity = (1 + small_velocity) / 2 + return upright.mean() * small_control * small_velocity * centered + + def get_reward(self, physics): + """Returns a sparse or a smooth reward, as specified in the constructor.""" + return self._get_reward(physics, sparse=self._sparse) diff --git a/Dreamer/local_dm_control_suite/cartpole.xml b/Dreamer/local_dm_control_suite/cartpole.xml new file mode 100755 index 0000000..e01869d --- /dev/null +++ b/Dreamer/local_dm_control_suite/cartpole.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/cheetah.py b/Dreamer/local_dm_control_suite/cheetah.py new file mode 100755 index 0000000..7dd2a63 --- /dev/null +++ b/Dreamer/local_dm_control_suite/cheetah.py @@ -0,0 +1,97 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Cheetah Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers +from dm_control.utils import rewards + + +# How long the simulation will run, in seconds. +_DEFAULT_TIME_LIMIT = 10 + +# Running speed above which reward is 1. 
+_RUN_SPEED = 10 + +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('cheetah.xml'), common.ASSETS + + +@SUITE.add('benchmarking') +def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the run task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Cheetah(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment(physics, task, time_limit=time_limit, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Cheetah domain.""" + + def speed(self): + """Returns the horizontal speed of the Cheetah.""" + return self.named.data.sensordata['torso_subtreelinvel'][0] + + +class Cheetah(base.Task): + """A `Task` to train a running Cheetah.""" + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode.""" + # The indexing below assumes that all joints have a single DOF. + assert physics.model.nq == physics.model.njnt + is_limited = physics.model.jnt_limited == 1 + lower, upper = physics.model.jnt_range[is_limited].T + physics.data.qpos[is_limited] = self.random.uniform(lower, upper) + + # Stabilize the model before the actual simulation. + for _ in range(200): + physics.step() + + physics.data.time = 0 + self._timeout_progress = 0 + super(Cheetah, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of the state, ignoring horizontal position.""" + obs = collections.OrderedDict() + # Ignores horizontal position to maintain translational invariance. + obs['position'] = physics.data.qpos[1:].copy() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + return rewards.tolerance(physics.speed(), + bounds=(_RUN_SPEED, float('inf')), + margin=_RUN_SPEED, + value_at_margin=0, + sigmoid='linear') diff --git a/Dreamer/local_dm_control_suite/cheetah.xml b/Dreamer/local_dm_control_suite/cheetah.xml new file mode 100755 index 0000000..1952b5e --- /dev/null +++ b/Dreamer/local_dm_control_suite/cheetah.xml @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/common/__init__.py b/Dreamer/local_dm_control_suite/common/__init__.py new file mode 100755 index 0000000..62eab26 --- /dev/null +++ b/Dreamer/local_dm_control_suite/common/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Functions to manage the common assets for domains.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from dm_control.utils import io as resources + +_SUITE_DIR = os.path.dirname(os.path.dirname(__file__)) +_FILENAMES = [ + "./common/materials.xml", + "./common/materials_white_floor.xml", + "./common/skybox.xml", + "./common/visual.xml", +] + +ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename)) + for filename in _FILENAMES} + + +def read_model(model_filename): + """Reads a model XML file and returns its contents as a string.""" + return resources.GetResource(os.path.join(_SUITE_DIR, model_filename)) diff --git a/Dreamer/local_dm_control_suite/common/materials.xml b/Dreamer/local_dm_control_suite/common/materials.xml new file mode 100755 index 0000000..5a3b169 --- /dev/null +++ b/Dreamer/local_dm_control_suite/common/materials.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/common/materials_white_floor.xml b/Dreamer/local_dm_control_suite/common/materials_white_floor.xml new file mode 100755 index 0000000..a1e35c2 --- /dev/null +++ b/Dreamer/local_dm_control_suite/common/materials_white_floor.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/common/skybox.xml b/Dreamer/local_dm_control_suite/common/skybox.xml new file mode 100755 index 0000000..b888692 --- /dev/null +++ b/Dreamer/local_dm_control_suite/common/skybox.xml @@ -0,0 +1,6 @@ + + + + + diff --git a/Dreamer/local_dm_control_suite/common/visual.xml b/Dreamer/local_dm_control_suite/common/visual.xml new file mode 100755 index 0000000..ede15ad --- /dev/null +++ b/Dreamer/local_dm_control_suite/common/visual.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/Dreamer/local_dm_control_suite/demos/mocap_demo.py b/Dreamer/local_dm_control_suite/demos/mocap_demo.py new file mode 100755 index 0000000..2e2c7ca --- /dev/null +++ b/Dreamer/local_dm_control_suite/demos/mocap_demo.py @@ -0,0 +1,84 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Demonstration of amc parsing for CMU mocap database. + +To run the demo, supply a path to a `.amc` file: + + python mocap_demo --filename='path/to/mocap.amc' + +CMU motion capture clips are available at mocap.cs.cmu.edu +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +# Internal dependencies. 
+ +from absl import app +from absl import flags + +from local_dm_control_suite import humanoid_CMU +from dm_control.suite.utils import parse_amc + +import matplotlib.pyplot as plt +import numpy as np + +FLAGS = flags.FLAGS +flags.DEFINE_string('filename', None, 'amc file to be converted.') +flags.DEFINE_integer('max_num_frames', 90, + 'Maximum number of frames for plotting/playback') + + +def main(unused_argv): + env = humanoid_CMU.stand() + + # Parse and convert specified clip. + converted = parse_amc.convert(FLAGS.filename, + env.physics, env.control_timestep()) + + max_frame = min(FLAGS.max_num_frames, converted.qpos.shape[1] - 1) + + width = 480 + height = 480 + video = np.zeros((max_frame, height, 2 * width, 3), dtype=np.uint8) + + for i in range(max_frame): + p_i = converted.qpos[:, i] + with env.physics.reset_context(): + env.physics.data.qpos[:] = p_i + video[i] = np.hstack([env.physics.render(height, width, camera_id=0), + env.physics.render(height, width, camera_id=1)]) + + tic = time.time() + for i in range(max_frame): + if i == 0: + img = plt.imshow(video[i]) + else: + img.set_data(video[i]) + toc = time.time() + clock_dt = toc - tic + tic = time.time() + # Real-time playback not always possible as clock_dt > .03 + plt.pause(max(0.01, 0.03 - clock_dt)) # Need min display time > 0.0. + plt.draw() + plt.waitforbuttonpress() + + +if __name__ == '__main__': + flags.mark_flag_as_required('filename') + app.run(main) diff --git a/Dreamer/local_dm_control_suite/demos/zeros.amc b/Dreamer/local_dm_control_suite/demos/zeros.amc new file mode 100755 index 0000000..b4590a4 --- /dev/null +++ b/Dreamer/local_dm_control_suite/demos/zeros.amc @@ -0,0 +1,213 @@ +#DUMMY AMC for testing +:FULLY-SPECIFIED +:DEGREES +1 +root 0 0 0 0 0 0 +lowerback 0 0 0 +upperback 0 0 0 +thorax 0 0 0 +lowerneck 0 0 0 +upperneck 0 0 0 +head 0 0 0 +rclavicle 0 0 +rhumerus 0 0 0 +rradius 0 +rwrist 0 +rhand 0 0 +rfingers 0 +rthumb 0 0 +lclavicle 0 0 +lhumerus 0 0 0 +lradius 0 +lwrist 0 +lhand 0 0 +lfingers 0 +lthumb 0 0 +rfemur 0 0 0 +rtibia 0 +rfoot 0 0 +rtoes 0 +lfemur 0 0 0 +ltibia 0 +lfoot 0 0 +ltoes 0 +2 +root 0 0 0 0 0 0 +lowerback 0 0 0 +upperback 0 0 0 +thorax 0 0 0 +lowerneck 0 0 0 +upperneck 0 0 0 +head 0 0 0 +rclavicle 0 0 +rhumerus 0 0 0 +rradius 0 +rwrist 0 +rhand 0 0 +rfingers 0 +rthumb 0 0 +lclavicle 0 0 +lhumerus 0 0 0 +lradius 0 +lwrist 0 +lhand 0 0 +lfingers 0 +lthumb 0 0 +rfemur 0 0 0 +rtibia 0 +rfoot 0 0 +rtoes 0 +lfemur 0 0 0 +ltibia 0 +lfoot 0 0 +ltoes 0 +3 +root 0 0 0 0 0 0 +lowerback 0 0 0 +upperback 0 0 0 +thorax 0 0 0 +lowerneck 0 0 0 +upperneck 0 0 0 +head 0 0 0 +rclavicle 0 0 +rhumerus 0 0 0 +rradius 0 +rwrist 0 +rhand 0 0 +rfingers 0 +rthumb 0 0 +lclavicle 0 0 +lhumerus 0 0 0 +lradius 0 +lwrist 0 +lhand 0 0 +lfingers 0 +lthumb 0 0 +rfemur 0 0 0 +rtibia 0 +rfoot 0 0 +rtoes 0 +lfemur 0 0 0 +ltibia 0 +lfoot 0 0 +ltoes 0 +4 +root 0 0 0 0 0 0 +lowerback 0 0 0 +upperback 0 0 0 +thorax 0 0 0 +lowerneck 0 0 0 +upperneck 0 0 0 +head 0 0 0 +rclavicle 0 0 +rhumerus 0 0 0 +rradius 0 +rwrist 0 +rhand 0 0 +rfingers 0 +rthumb 0 0 +lclavicle 0 0 +lhumerus 0 0 0 +lradius 0 +lwrist 0 +lhand 0 0 +lfingers 0 +lthumb 0 0 +rfemur 0 0 0 +rtibia 0 +rfoot 0 0 +rtoes 0 +lfemur 0 0 0 +ltibia 0 +lfoot 0 0 +ltoes 0 +5 +root 0 0 0 0 0 0 +lowerback 0 0 0 +upperback 0 0 0 +thorax 0 0 0 +lowerneck 0 0 0 +upperneck 0 0 0 +head 0 0 0 +rclavicle 0 0 +rhumerus 0 0 0 +rradius 0 +rwrist 0 +rhand 0 0 +rfingers 0 +rthumb 0 0 +lclavicle 0 0 +lhumerus 0 0 0 +lradius 0 +lwrist 0 +lhand 0 0 +lfingers 0 +lthumb 0 0 +rfemur 0 0 0 +rtibia 
0
+rfoot 0 0
+rtoes 0
+lfemur 0 0 0
+ltibia 0
+lfoot 0 0
+ltoes 0
+6
+root 0 0 0 0 0 0
+lowerback 0 0 0
+upperback 0 0 0
+thorax 0 0 0
+lowerneck 0 0 0
+upperneck 0 0 0
+head 0 0 0
+rclavicle 0 0
+rhumerus 0 0 0
+rradius 0
+rwrist 0
+rhand 0 0
+rfingers 0
+rthumb 0 0
+lclavicle 0 0
+lhumerus 0 0 0
+lradius 0
+lwrist 0
+lhand 0 0
+lfingers 0
+lthumb 0 0
+rfemur 0 0 0
+rtibia 0
+rfoot 0 0
+rtoes 0
+lfemur 0 0 0
+ltibia 0
+lfoot 0 0
+ltoes 0
+7
+root 0 0 0 0 0 0
+lowerback 0 0 0
+upperback 0 0 0
+thorax 0 0 0
+lowerneck 0 0 0
+upperneck 0 0 0
+head 0 0 0
+rclavicle 0 0
+rhumerus 0 0 0
+rradius 0
+rwrist 0
+rhand 0 0
+rfingers 0
+rthumb 0 0
+lclavicle 0 0
+lhumerus 0 0 0
+lradius 0
+lwrist 0
+lhand 0 0
+lfingers 0
+lthumb 0 0
+rfemur 0 0 0
+rtibia 0
+rfoot 0 0
+rtoes 0
+lfemur 0 0 0
+ltibia 0
+lfoot 0 0
+ltoes 0
diff --git a/Dreamer/local_dm_control_suite/explore.py b/Dreamer/local_dm_control_suite/explore.py
new file mode 100755
index 0000000..06fb0a8
--- /dev/null
+++ b/Dreamer/local_dm_control_suite/explore.py
@@ -0,0 +1,84 @@
+# Copyright 2018 The dm_control Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Control suite environments explorer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+from dm_control import suite
+from dm_control.suite.wrappers import action_noise
+from six.moves import input
+
+from dm_control import viewer
+
+
+_ALL_NAMES = ['.'.join(domain_task) for domain_task in suite.ALL_TASKS]
+
+flags.DEFINE_enum('environment_name', None, _ALL_NAMES,
+                  'Optional \'domain_name.task_name\' pair specifying the '
+                  'environment to load. If unspecified a prompt will appear to '
+                  'select one.')
+flags.DEFINE_bool('timeout', True, 'Whether episodes should have a time limit.')
+flags.DEFINE_bool('visualize_reward', True,
+                  'Whether to vary the colors of geoms according to the '
+                  'current reward value.')
+flags.DEFINE_float('action_noise', 0.,
+                   'Standard deviation of Gaussian noise to apply to actions, '
+                   'expressed as a fraction of the max-min range for each '
+                   'action dimension. Defaults to 0, i.e. no noise.')
+FLAGS = flags.FLAGS
+
+
+def prompt_environment_name(prompt, values):
+  environment_name = None
+  while not environment_name:
+    environment_name = input(prompt)
+    # `list.index` raises ValueError on a miss, so test membership instead.
+    if not environment_name or environment_name not in values:
+      print('"%s" is not a valid environment name.'
% environment_name) + environment_name = None + return environment_name + + +def main(argv): + del argv + environment_name = FLAGS.environment_name + if environment_name is None: + print('\n '.join(['Available environments:'] + _ALL_NAMES)) + environment_name = prompt_environment_name( + 'Please select an environment name: ', _ALL_NAMES) + + index = _ALL_NAMES.index(environment_name) + domain_name, task_name = suite.ALL_TASKS[index] + + task_kwargs = {} + if not FLAGS.timeout: + task_kwargs['time_limit'] = float('inf') + + def loader(): + env = suite.load( + domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs) + env.task.visualize_reward = FLAGS.visualize_reward + if FLAGS.action_noise > 0: + env = action_noise.Wrapper(env, scale=FLAGS.action_noise) + return env + + viewer.launch(loader) + + +if __name__ == '__main__': + app.run(main) diff --git a/Dreamer/local_dm_control_suite/finger.py b/Dreamer/local_dm_control_suite/finger.py new file mode 100755 index 0000000..e700db6 --- /dev/null +++ b/Dreamer/local_dm_control_suite/finger.py @@ -0,0 +1,217 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Finger Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.suite.utils import randomizers +from dm_control.utils import containers +import numpy as np +from six.moves import range + +_DEFAULT_TIME_LIMIT = 20 # (seconds) +_CONTROL_TIMESTEP = .02 # (seconds) +# For TURN tasks, the 'tip' geom needs to enter a spherical target of sizes: +_EASY_TARGET_SIZE = 0.07 +_HARD_TARGET_SIZE = 0.03 +# Initial spin velocity for the Stop task. +_INITIAL_SPIN_VELOCITY = 100 +# Spinning slower than this value (radian/second) is considered stopped. +_STOP_VELOCITY = 1e-6 +# Spinning faster than this value (radian/second) is considered spinning. 
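+# `Spin.get_reward` (below) is the sparse indicator
+# float(hinge_velocity() <= -_SPIN_VELOCITY): reward is 1 only while the
+# spinner rotates in the negative direction faster than this threshold.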
+_SPIN_VELOCITY = 15.0 + + +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('finger.xml'), common.ASSETS + + +@SUITE.add('benchmarking') +def spin(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Spin task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Spin(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add('benchmarking') +def turn_easy(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns the easy Turn task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Turn(target_radius=_EASY_TARGET_SIZE, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add('benchmarking') +def turn_hard(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns the hard Turn task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Turn(target_radius=_HARD_TARGET_SIZE, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Finger domain.""" + + def touch(self): + """Returns logarithmically scaled signals from the two touch sensors.""" + return np.log1p(self.named.data.sensordata[['touchtop', 'touchbottom']]) + + def hinge_velocity(self): + """Returns the velocity of the hinge joint.""" + return self.named.data.sensordata['hinge_velocity'] + + def tip_position(self): + """Returns the (x,z) position of the tip relative to the hinge.""" + return (self.named.data.sensordata['tip'][[0, 2]] - + self.named.data.sensordata['spinner'][[0, 2]]) + + def bounded_position(self): + """Returns the positions, with the hinge angle replaced by tip position.""" + return np.hstack((self.named.data.sensordata[['proximal', 'distal']], + self.tip_position())) + + def velocity(self): + """Returns the velocities (extracted from sensordata).""" + return self.named.data.sensordata[['proximal_velocity', + 'distal_velocity', + 'hinge_velocity']] + + def target_position(self): + """Returns the (x,z) position of the target relative to the hinge.""" + return (self.named.data.sensordata['target'][[0, 2]] - + self.named.data.sensordata['spinner'][[0, 2]]) + + def to_target(self): + """Returns the vector from the tip to the target.""" + return self.target_position() - self.tip_position() + + def dist_to_target(self): + """Returns the signed distance to the target surface, negative is inside.""" + return (np.linalg.norm(self.to_target()) - + self.named.model.site_size['target', 0]) + + +class Spin(base.Task): + """A Finger `Task` to spin the stopped body.""" + + def __init__(self, random=None): + """Initializes a new `Spin` instance. + + Args: + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). 
+ """ + super(Spin, self).__init__(random=random) + + def initialize_episode(self, physics): + physics.named.model.site_rgba['target', 3] = 0 + physics.named.model.site_rgba['tip', 3] = 0 + physics.named.model.dof_damping['hinge'] = .03 + _set_random_joint_angles(physics, self.random) + super(Spin, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns state and touch sensors, and target info.""" + obs = collections.OrderedDict() + obs['position'] = physics.bounded_position() + obs['velocity'] = physics.velocity() + obs['touch'] = physics.touch() + return obs + + def get_reward(self, physics): + """Returns a sparse reward.""" + return float(physics.hinge_velocity() <= -_SPIN_VELOCITY) + + +class Turn(base.Task): + """A Finger `Task` to turn the body to a target angle.""" + + def __init__(self, target_radius, random=None): + """Initializes a new `Turn` instance. + + Args: + target_radius: Radius of the target site, which specifies the goal angle. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._target_radius = target_radius + super(Turn, self).__init__(random=random) + + def initialize_episode(self, physics): + target_angle = self.random.uniform(-np.pi, np.pi) + hinge_x, hinge_z = physics.named.data.xanchor['hinge', ['x', 'z']] + radius = physics.named.model.geom_size['cap1'].sum() + target_x = hinge_x + radius * np.sin(target_angle) + target_z = hinge_z + radius * np.cos(target_angle) + physics.named.model.site_pos['target', ['x', 'z']] = target_x, target_z + physics.named.model.site_size['target', 0] = self._target_radius + + _set_random_joint_angles(physics, self.random) + + super(Turn, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns state, touch sensors, and target info.""" + obs = collections.OrderedDict() + obs['position'] = physics.bounded_position() + obs['velocity'] = physics.velocity() + obs['touch'] = physics.touch() + obs['target_position'] = physics.target_position() + obs['dist_to_target'] = physics.dist_to_target() + return obs + + def get_reward(self, physics): + return float(physics.dist_to_target() <= 0) + + +def _set_random_joint_angles(physics, random, max_attempts=1000): + """Sets the joints to a random collision-free state.""" + + for _ in range(max_attempts): + randomizers.randomize_limited_and_rotational_joints(physics, random) + # Check for collisions. + physics.after_reset() + if physics.data.ncon == 0: + break + else: + raise RuntimeError('Could not find a collision-free state ' + 'after {} attempts'.format(max_attempts)) diff --git a/Dreamer/local_dm_control_suite/finger.xml b/Dreamer/local_dm_control_suite/finger.xml new file mode 100755 index 0000000..3b35986 --- /dev/null +++ b/Dreamer/local_dm_control_suite/finger.xml @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/fish.py b/Dreamer/local_dm_control_suite/fish.py new file mode 100755 index 0000000..3262def --- /dev/null +++ b/Dreamer/local_dm_control_suite/fish.py @@ -0,0 +1,176 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Fish Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers +from dm_control.utils import rewards +import numpy as np + + +_DEFAULT_TIME_LIMIT = 40 +_CONTROL_TIMESTEP = .04 +_JOINTS = ['tail1', + 'tail_twist', + 'tail2', + 'finright_roll', + 'finright_pitch', + 'finleft_roll', + 'finleft_pitch'] +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('fish.xml'), common.ASSETS + + +@SUITE.add('benchmarking') +def upright(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns the Fish Upright task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Upright(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +@SUITE.add('benchmarking') +def swim(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Fish Swim task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Swim(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Fish domain.""" + + def upright(self): + """Returns projection from z-axes of torso to the z-axes of worldbody.""" + return self.named.data.xmat['torso', 'zz'] + + def torso_velocity(self): + """Returns velocities and angular velocities of the torso.""" + return self.data.sensordata + + def joint_velocities(self): + """Returns the joint velocities.""" + return self.named.data.qvel[_JOINTS] + + def joint_angles(self): + """Returns the joint positions.""" + return self.named.data.qpos[_JOINTS] + + def mouth_to_target(self): + """Returns a vector, from mouth to target in local coordinate of mouth.""" + data = self.named.data + mouth_to_target_global = data.geom_xpos['target'] - data.geom_xpos['mouth'] + return mouth_to_target_global.dot(data.geom_xmat['mouth'].reshape(3, 3)) + + +class Upright(base.Task): + """A Fish `Task` for getting the torso upright with smooth reward.""" + + def __init__(self, random=None): + """Initializes an instance of `Upright`. + + Args: + random: Either an existing `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically. 
+ """ + super(Upright, self).__init__(random=random) + + def initialize_episode(self, physics): + """Randomizes the tail and fin angles and the orientation of the Fish.""" + quat = self.random.randn(4) + physics.named.data.qpos['root'][3:7] = quat / np.linalg.norm(quat) + for joint in _JOINTS: + physics.named.data.qpos[joint] = self.random.uniform(-.2, .2) + # Hide the target. It's irrelevant for this task. + physics.named.model.geom_rgba['target', 3] = 0 + super(Upright, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of joint angles, velocities and uprightness.""" + obs = collections.OrderedDict() + obs['joint_angles'] = physics.joint_angles() + obs['upright'] = physics.upright() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a smooth reward.""" + return rewards.tolerance(physics.upright(), bounds=(1, 1), margin=1) + + +class Swim(base.Task): + """A Fish `Task` for swimming with smooth reward.""" + + def __init__(self, random=None): + """Initializes an instance of `Swim`. + + Args: + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + super(Swim, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode.""" + + quat = self.random.randn(4) + physics.named.data.qpos['root'][3:7] = quat / np.linalg.norm(quat) + for joint in _JOINTS: + physics.named.data.qpos[joint] = self.random.uniform(-.2, .2) + # Randomize target position. + physics.named.model.geom_pos['target', 'x'] = self.random.uniform(-.4, .4) + physics.named.model.geom_pos['target', 'y'] = self.random.uniform(-.4, .4) + physics.named.model.geom_pos['target', 'z'] = self.random.uniform(.1, .3) + super(Swim, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of joints, target direction and velocities.""" + obs = collections.OrderedDict() + obs['joint_angles'] = physics.joint_angles() + obs['upright'] = physics.upright() + obs['target'] = physics.mouth_to_target() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a smooth reward.""" + radii = physics.named.model.geom_size[['mouth', 'target'], 0].sum() + in_target = rewards.tolerance(np.linalg.norm(physics.mouth_to_target()), + bounds=(0, radii), margin=2*radii) + is_upright = 0.5 * (physics.upright() + 1) + return (7*in_target + is_upright) / 8 diff --git a/Dreamer/local_dm_control_suite/fish.xml b/Dreamer/local_dm_control_suite/fish.xml new file mode 100755 index 0000000..43de56d --- /dev/null +++ b/Dreamer/local_dm_control_suite/fish.xml @@ -0,0 +1,85 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/hopper.py b/Dreamer/local_dm_control_suite/hopper.py new file mode 100755 index 0000000..6458e41 --- /dev/null +++ b/Dreamer/local_dm_control_suite/hopper.py @@ -0,0 +1,138 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Hopper domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.suite.utils import randomizers +from dm_control.utils import containers +from dm_control.utils import rewards +import numpy as np + + +SUITE = containers.TaggedTasks() + +_CONTROL_TIMESTEP = .02 # (Seconds) + +# Default duration of an episode, in seconds. +_DEFAULT_TIME_LIMIT = 20 + +# Minimal height of torso over foot above which stand reward is 1. +_STAND_HEIGHT = 0.6 + +# Hopping speed above which hop reward is 1. +_HOP_SPEED = 2 + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('hopper.xml'), common.ASSETS + + +@SUITE.add('benchmarking') +def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns a Hopper that strives to stand upright, balancing its pose.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Hopper(hopping=False, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add('benchmarking') +def hop(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns a Hopper that strives to hop forward.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = Hopper(hopping=True, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Hopper domain.""" + + def height(self): + """Returns height of torso with respect to foot.""" + return (self.named.data.xipos['torso', 'z'] - + self.named.data.xipos['foot', 'z']) + + def speed(self): + """Returns horizontal speed of the Hopper.""" + return self.named.data.sensordata['torso_subtreelinvel'][0] + + def touch(self): + """Returns the signals from two foot touch sensors.""" + return np.log1p(self.named.data.sensordata[['touch_toe', 'touch_heel']]) + + +class Hopper(base.Task): + """A Hopper's `Task` to train a standing and a jumping Hopper.""" + + def __init__(self, hopping, random=None): + """Initialize an instance of `Hopper`. + + Args: + hopping: Boolean, if True the task is to hop forwards, otherwise it is to + balance upright. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). 
+ """ + self._hopping = hopping + super(Hopper, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode.""" + randomizers.randomize_limited_and_rotational_joints(physics, self.random) + self._timeout_progress = 0 + super(Hopper, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of positions, velocities and touch sensors.""" + obs = collections.OrderedDict() + # Ignores horizontal position to maintain translational invariance: + obs['position'] = physics.data.qpos[1:].copy() + obs['velocity'] = physics.velocity() + obs['touch'] = physics.touch() + return obs + + def get_reward(self, physics): + """Returns a reward applicable to the performed task.""" + standing = rewards.tolerance(physics.height(), (_STAND_HEIGHT, 2)) + if self._hopping: + hopping = rewards.tolerance(physics.speed(), + bounds=(_HOP_SPEED, float('inf')), + margin=_HOP_SPEED/2, + value_at_margin=0.5, + sigmoid='linear') + return standing * hopping + else: + small_control = rewards.tolerance(physics.control(), + margin=1, value_at_margin=0, + sigmoid='quadratic').mean() + small_control = (small_control + 4) / 5 + return standing * small_control diff --git a/Dreamer/local_dm_control_suite/hopper.xml b/Dreamer/local_dm_control_suite/hopper.xml new file mode 100755 index 0000000..0c8ec28 --- /dev/null +++ b/Dreamer/local_dm_control_suite/hopper.xml @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/humanoid.py b/Dreamer/local_dm_control_suite/humanoid.py new file mode 100755 index 0000000..5a161f0 --- /dev/null +++ b/Dreamer/local_dm_control_suite/humanoid.py @@ -0,0 +1,211 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Humanoid Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.suite.utils import randomizers +from dm_control.utils import containers +from dm_control.utils import rewards +import numpy as np + +_DEFAULT_TIME_LIMIT = 25 +_CONTROL_TIMESTEP = .025 + +# Height of head above which stand reward is 1. +_STAND_HEIGHT = 1.4 + +# Horizontal speeds above which move reward is 1. 
+_WALK_SPEED = 1
+_RUN_SPEED = 10
+
+
+SUITE = containers.TaggedTasks()
+
+
+def get_model_and_assets():
+  """Returns a tuple containing the model XML string and a dict of assets."""
+  return common.read_model('humanoid.xml'), common.ASSETS
+
+
+@SUITE.add('benchmarking')
+def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
+  """Returns the Stand task."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Humanoid(move_speed=0, pure_state=False, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
+      **environment_kwargs)
+
+
+@SUITE.add('benchmarking')
+def walk(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
+  """Returns the Walk task."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Humanoid(move_speed=_WALK_SPEED, pure_state=False, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
+      **environment_kwargs)
+
+
+@SUITE.add('benchmarking')
+def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
+  """Returns the Run task."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Humanoid(move_speed=_RUN_SPEED, pure_state=False, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
+      **environment_kwargs)
+
+
+@SUITE.add()
+def run_pure_state(time_limit=_DEFAULT_TIME_LIMIT, random=None,
+                   environment_kwargs=None):
+  """Returns the Run task with pure-state observations."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Humanoid(move_speed=_RUN_SPEED, pure_state=True, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
+      **environment_kwargs)
+
+
+class Physics(mujoco.Physics):
+  """Physics simulation with additional features for the Humanoid domain."""
+
+  def torso_upright(self):
+    """Returns projection from z-axes of torso to the z-axes of world."""
+    return self.named.data.xmat['torso', 'zz']
+
+  def head_height(self):
+    """Returns the height of the head."""
+    return self.named.data.xpos['head', 'z']
+
+  def center_of_mass_position(self):
+    """Returns position of the center-of-mass."""
+    return self.named.data.subtree_com['torso'].copy()
+
+  def center_of_mass_velocity(self):
+    """Returns the velocity of the center-of-mass."""
+    return self.named.data.sensordata['torso_subtreelinvel'].copy()
+
+  def torso_vertical_orientation(self):
+    """Returns the z-projection of the torso orientation matrix."""
+    return self.named.data.xmat['torso', ['zx', 'zy', 'zz']]
+
+  def joint_angles(self):
+    """Returns the state without global orientation or position."""
+    return self.data.qpos[7:].copy()  # Skip the 7 DoFs of the free root joint.
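+  # The `extremities` method below expresses hand/foot offsets in the torso
+  # frame: with R the torso rotation matrix (`xmat` reshaped to 3x3) and v a
+  # world-frame offset, v.dot(R) equals R.T @ v, i.e. the offset as seen
+  # from the torso. A rough equivalent for a single limb:
+  #
+  #   R = physics.named.data.xmat['torso'].reshape(3, 3)
+  #   v = physics.named.data.xpos['left_hand'] - physics.named.data.xpos['torso']
+  #   v_local = v.dot(R)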
+ + def extremities(self): + """Returns end effector positions in egocentric frame.""" + torso_frame = self.named.data.xmat['torso'].reshape(3, 3) + torso_pos = self.named.data.xpos['torso'] + positions = [] + for side in ('left_', 'right_'): + for limb in ('hand', 'foot'): + torso_to_limb = self.named.data.xpos[side + limb] - torso_pos + positions.append(torso_to_limb.dot(torso_frame)) + return np.hstack(positions) + + +class Humanoid(base.Task): + """A humanoid task.""" + + def __init__(self, move_speed, pure_state, random=None): + """Initializes an instance of `Humanoid`. + + Args: + move_speed: A float. If this value is zero, reward is given simply for + standing up. Otherwise this specifies a target horizontal velocity for + the walking task. + pure_state: A bool. Whether the observations consist of the pure MuJoCo + state or includes some useful features thereof. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._move_speed = move_speed + self._pure_state = pure_state + super(Humanoid, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Args: + physics: An instance of `Physics`. + + """ + # Find a collision-free random initial configuration. + penetrating = True + while penetrating: + randomizers.randomize_limited_and_rotational_joints(physics, self.random) + # Check for collisions. + physics.after_reset() + penetrating = physics.data.ncon > 0 + super(Humanoid, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns either the pure state or a set of egocentric features.""" + obs = collections.OrderedDict() + if self._pure_state: + obs['position'] = physics.position() + obs['velocity'] = physics.velocity() + else: + obs['joint_angles'] = physics.joint_angles() + obs['head_height'] = physics.head_height() + obs['extremities'] = physics.extremities() + obs['torso_vertical'] = physics.torso_vertical_orientation() + obs['com_velocity'] = physics.center_of_mass_velocity() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + standing = rewards.tolerance(physics.head_height(), + bounds=(_STAND_HEIGHT, float('inf')), + margin=_STAND_HEIGHT/4) + upright = rewards.tolerance(physics.torso_upright(), + bounds=(0.9, float('inf')), sigmoid='linear', + margin=1.9, value_at_margin=0) + stand_reward = standing * upright + small_control = rewards.tolerance(physics.control(), margin=1, + value_at_margin=0, + sigmoid='quadratic').mean() + small_control = (4 + small_control) / 5 + if self._move_speed == 0: + horizontal_velocity = physics.center_of_mass_velocity()[[0, 1]] + dont_move = rewards.tolerance(horizontal_velocity, margin=2).mean() + return small_control * stand_reward * dont_move + else: + com_velocity = np.linalg.norm(physics.center_of_mass_velocity()[[0, 1]]) + move = rewards.tolerance(com_velocity, + bounds=(self._move_speed, float('inf')), + margin=self._move_speed, value_at_margin=0, + sigmoid='linear') + move = (5*move + 1) / 6 + return small_control * stand_reward * move diff --git a/Dreamer/local_dm_control_suite/humanoid.xml b/Dreamer/local_dm_control_suite/humanoid.xml new file mode 100755 index 0000000..32b84c5 --- /dev/null +++ b/Dreamer/local_dm_control_suite/humanoid.xml @@ -0,0 +1,202 @@ + + + + + + + + + diff --git 
a/Dreamer/local_dm_control_suite/humanoid_CMU.py b/Dreamer/local_dm_control_suite/humanoid_CMU.py new file mode 100755 index 0000000..d06fb63 --- /dev/null +++ b/Dreamer/local_dm_control_suite/humanoid_CMU.py @@ -0,0 +1,179 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Humanoid_CMU Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.suite.utils import randomizers +from dm_control.utils import containers +from dm_control.utils import rewards +import numpy as np + +_DEFAULT_TIME_LIMIT = 20 +_CONTROL_TIMESTEP = 0.02 + +# Height of head above which stand reward is 1. +_STAND_HEIGHT = 1.4 + +# Horizontal speeds above which move reward is 1. +_WALK_SPEED = 1 +_RUN_SPEED = 10 + +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('humanoid_CMU.xml'), common.ASSETS + + +@SUITE.add() +def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Stand task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = HumanoidCMU(move_speed=0, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add() +def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Run task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = HumanoidCMU(move_speed=_RUN_SPEED, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the humanoid_CMU domain.""" + + def thorax_upright(self): + """Returns projection from y-axes of thorax to the z-axes of world.""" + return self.named.data.xmat['thorax', 'zy'] + + def head_height(self): + """Returns the height of the head.""" + return self.named.data.xpos['head', 'z'] + + def center_of_mass_position(self): + """Returns position of the center-of-mass.""" + return self.named.data.subtree_com['thorax'] + + def center_of_mass_velocity(self): + """Returns the velocity of the center-of-mass.""" + return self.named.data.sensordata['thorax_subtreelinvel'].copy() + + def torso_vertical_orientation(self): + """Returns the z-projection of the thorax orientation matrix.""" + return self.named.data.xmat['thorax', ['zx', 'zy', 'zz']] + + def joint_angles(self): + """Returns the state without global orientation or 
position.""" + return self.data.qpos[7:].copy() # Skip the 7 DoFs of the free root joint. + + def extremities(self): + """Returns end effector positions in egocentric frame.""" + torso_frame = self.named.data.xmat['thorax'].reshape(3, 3) + torso_pos = self.named.data.xpos['thorax'] + positions = [] + for side in ('l', 'r'): + for limb in ('hand', 'foot'): + torso_to_limb = self.named.data.xpos[side + limb] - torso_pos + positions.append(torso_to_limb.dot(torso_frame)) + return np.hstack(positions) + + +class HumanoidCMU(base.Task): + """A task for the CMU Humanoid.""" + + def __init__(self, move_speed, random=None): + """Initializes an instance of `Humanoid_CMU`. + + Args: + move_speed: A float. If this value is zero, reward is given simply for + standing up. Otherwise this specifies a target horizontal velocity for + the walking task. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._move_speed = move_speed + super(HumanoidCMU, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets a random collision-free configuration at the start of each episode. + + Args: + physics: An instance of `Physics`. + """ + penetrating = True + while penetrating: + randomizers.randomize_limited_and_rotational_joints( + physics, self.random) + # Check for collisions. + physics.after_reset() + penetrating = physics.data.ncon > 0 + super(HumanoidCMU, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns a set of egocentric features.""" + obs = collections.OrderedDict() + obs['joint_angles'] = physics.joint_angles() + obs['head_height'] = physics.head_height() + obs['extremities'] = physics.extremities() + obs['torso_vertical'] = physics.torso_vertical_orientation() + obs['com_velocity'] = physics.center_of_mass_velocity() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + standing = rewards.tolerance(physics.head_height(), + bounds=(_STAND_HEIGHT, float('inf')), + margin=_STAND_HEIGHT/4) + upright = rewards.tolerance(physics.thorax_upright(), + bounds=(0.9, float('inf')), sigmoid='linear', + margin=1.9, value_at_margin=0) + stand_reward = standing * upright + small_control = rewards.tolerance(physics.control(), margin=1, + value_at_margin=0, + sigmoid='quadratic').mean() + small_control = (4 + small_control) / 5 + if self._move_speed == 0: + horizontal_velocity = physics.center_of_mass_velocity()[[0, 1]] + dont_move = rewards.tolerance(horizontal_velocity, margin=2).mean() + return small_control * stand_reward * dont_move + else: + com_velocity = np.linalg.norm(physics.center_of_mass_velocity()[[0, 1]]) + move = rewards.tolerance(com_velocity, + bounds=(self._move_speed, float('inf')), + margin=self._move_speed, value_at_margin=0, + sigmoid='linear') + move = (5*move + 1) / 6 + return small_control * stand_reward * move diff --git a/Dreamer/local_dm_control_suite/humanoid_CMU.xml b/Dreamer/local_dm_control_suite/humanoid_CMU.xml new file mode 100755 index 0000000..9a41a16 --- /dev/null +++ b/Dreamer/local_dm_control_suite/humanoid_CMU.xml @@ -0,0 +1,289 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/lqr.py b/Dreamer/local_dm_control_suite/lqr.py new file mode 100755 index 0000000..34197b4 --- /dev/null +++ b/Dreamer/local_dm_control_suite/lqr.py @@ -0,0 +1,272 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Procedurally generated LQR domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers +from dm_control.utils import xml_tools +from lxml import etree +import numpy as np +from six.moves import range + +from dm_control.utils import io as resources + +_DEFAULT_TIME_LIMIT = float('inf') +_CONTROL_COST_COEF = 0.1 +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(n_bodies, n_actuators, random): + """Returns the model description as an XML string and a dict of assets. + + Args: + n_bodies: An int, number of bodies of the LQR. + n_actuators: An int, number of actuated bodies of the LQR. `n_actuators` + should be less or equal than `n_bodies`. + random: A `numpy.random.RandomState` instance. + + Returns: + A tuple `(model_xml_string, assets)`, where `assets` is a dict consisting of + `{filename: contents_string}` pairs. + """ + return _make_model(n_bodies, n_actuators, random), common.ASSETS + + +@SUITE.add() +def lqr_2_1(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns an LQR environment with 2 bodies of which the first is actuated.""" + return _make_lqr(n_bodies=2, + n_actuators=1, + control_cost_coef=_CONTROL_COST_COEF, + time_limit=time_limit, + random=random, + environment_kwargs=environment_kwargs) + + +@SUITE.add() +def lqr_6_2(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns an LQR environment with 6 bodies of which first 2 are actuated.""" + return _make_lqr(n_bodies=6, + n_actuators=2, + control_cost_coef=_CONTROL_COST_COEF, + time_limit=time_limit, + random=random, + environment_kwargs=environment_kwargs) + + +def _make_lqr(n_bodies, n_actuators, control_cost_coef, time_limit, random, + environment_kwargs): + """Returns a LQR environment. + + Args: + n_bodies: An int, number of bodies of the LQR. + n_actuators: An int, number of actuated bodies of the LQR. `n_actuators` + should be less or equal than `n_bodies`. + control_cost_coef: A number, the coefficient of the control cost. + time_limit: An int, maximum time for each episode in seconds. 
+    random: Either an existing `numpy.random.RandomState` instance, an
+      integer seed for creating a new `RandomState`, or None to select a seed
+      automatically.
+    environment_kwargs: A `dict` specifying keyword arguments for the
+      environment, or None.
+
+  Returns:
+    An LQR environment with `n_bodies` bodies, of which the first `n_actuators`
+    are actuated.
+  """
+
+  if not isinstance(random, np.random.RandomState):
+    random = np.random.RandomState(random)
+
+  model_string, assets = get_model_and_assets(n_bodies, n_actuators,
+                                              random=random)
+  physics = Physics.from_xml_string(model_string, assets=assets)
+  task = LQRLevel(control_cost_coef, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(physics, task, time_limit=time_limit,
+                             **environment_kwargs)
+
+
+def _make_body(body_id, stiffness_range, damping_range, random):
+  """Returns an `etree.Element` defining a body.
+
+  Args:
+    body_id: Id of the created body.
+    stiffness_range: A tuple of (stiffness_lower_bound, stiffness_upper_bound).
+      The stiffness of the joint is drawn uniformly from this range.
+    damping_range: A tuple of (damping_lower_bound, damping_upper_bound). The
+      damping of the joint is drawn uniformly from this range.
+    random: A `numpy.random.RandomState` instance.
+
+  Returns:
+    A new instance of `etree.Element`. A body element with two children: joint
+    and geom.
+  """
+  body_name = 'body_{}'.format(body_id)
+  joint_name = 'joint_{}'.format(body_id)
+  geom_name = 'geom_{}'.format(body_id)
+
+  body = etree.Element('body', name=body_name)
+  body.set('pos', '.25 0 0')
+  joint = etree.SubElement(body, 'joint', name=joint_name)
+  body.append(etree.Element('geom', name=geom_name))
+  joint.set('stiffness',
+            str(random.uniform(stiffness_range[0], stiffness_range[1])))
+  joint.set('damping',
+            str(random.uniform(damping_range[0], damping_range[1])))
+  return body
+
+
+def _make_model(n_bodies,
+                n_actuators,
+                random,
+                stiffness_range=(15, 25),
+                damping_range=(0, 0)):
+  """Returns an MJCF XML string defining a model of springs and dampers.
+
+  Args:
+    n_bodies: An integer, the number of bodies (DoFs) in the system.
+    n_actuators: An integer, the number of actuated bodies.
+    random: A `numpy.random.RandomState` instance.
+    stiffness_range: A tuple containing minimum and maximum stiffness. Each
+      joint's stiffness is sampled uniformly from this interval.
+    damping_range: A tuple containing minimum and maximum damping. Each joint's
+      damping is sampled uniformly from this interval.
+
+  Returns:
+    An MJCF string describing the linear system.
+
+  Raises:
+    ValueError: If the number of bodies or actuators is erroneous.
+  """
+  if n_bodies < 1 or n_actuators < 1:
+    raise ValueError('At least 1 body and 1 actuator required.')
+  if n_actuators > n_bodies:
+    raise ValueError('At most 1 actuator per body.')
+
+  file_path = os.path.join(os.path.dirname(__file__), 'lqr.xml')
+  with resources.GetResourceAsFile(file_path) as xml_file:
+    mjcf = xml_tools.parse(xml_file)
+  parent = mjcf.find('./worldbody')
+  actuator = etree.SubElement(mjcf.getroot(), 'actuator')
+  tendon = etree.SubElement(mjcf.getroot(), 'tendon')
+
+  for body in range(n_bodies):
+    # Inserting body.
+    child = _make_body(body, stiffness_range, damping_range, random)
+    site_name = 'site_{}'.format(body)
+    child.append(etree.Element('site', name=site_name))
+
+    if body == 0:
+      child.set('pos', '.25 0 .1')
+    # Add actuators to the first n_actuators bodies.
+    if body < n_actuators:
+      # Adding actuator.
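+      # `joint_name` matches the name already assigned in `_make_body`; the
+      # <motor> element appended below is wired to that joint, so the
+      # generated MJCF contains, schematically:
+      #   <joint name="joint_0"/> ... <motor name="motor_0" joint="joint_0"/>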
+ joint_name = 'joint_{}'.format(body) + motor_name = 'motor_{}'.format(body) + child.find('joint').set('name', joint_name) + actuator.append(etree.Element('motor', name=motor_name, joint=joint_name)) + + # Add a tendon between consecutive bodies (for visualisation purposes only). + if body < n_bodies - 1: + child_site_name = 'site_{}'.format(body + 1) + tendon_name = 'tendon_{}'.format(body) + spatial = etree.SubElement(tendon, 'spatial', name=tendon_name) + spatial.append(etree.Element('site', site=site_name)) + spatial.append(etree.Element('site', site=child_site_name)) + parent.append(child) + parent = child + + return etree.tostring(mjcf, pretty_print=True) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the LQR domain.""" + + def state_norm(self): + """Returns the norm of the physics state.""" + return np.linalg.norm(self.state()) + + +class LQRLevel(base.Task): + """A Linear Quadratic Regulator `Task`.""" + + _TERMINAL_TOL = 1e-6 + + def __init__(self, control_cost_coef, random=None): + """Initializes an LQR level with cost = sum(states^2) + c*sum(controls^2). + + Args: + control_cost_coef: The coefficient of the control cost. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + + Raises: + ValueError: If the control cost coefficient is not positive. + """ + if control_cost_coef <= 0: + raise ValueError('control_cost_coef must be positive.') + + self._control_cost_coef = control_cost_coef + super(LQRLevel, self).__init__(random=random) + + @property + def control_cost_coef(self): + return self._control_cost_coef + + def initialize_episode(self, physics): + """Random state sampled from a unit sphere.""" + ndof = physics.model.nq + unit = self.random.randn(ndof) + physics.data.qpos[:] = np.sqrt(2) * unit / np.linalg.norm(unit) + super(LQRLevel, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of the state.""" + obs = collections.OrderedDict() + obs['position'] = physics.position() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a quadratic state and control reward.""" + position = physics.position() + state_cost = 0.5 * np.dot(position, position) + control_signal = physics.control() + control_l2_norm = 0.5 * np.dot(control_signal, control_signal) + return 1 - (state_cost + control_l2_norm * self._control_cost_coef) + + def get_evaluation(self, physics): + """Returns a sparse evaluation reward that is not used for learning.""" + return float(physics.state_norm() <= 0.01) + + def get_termination(self, physics): + """Terminates when the state norm is smaller than epsilon.""" + if physics.state_norm() < self._TERMINAL_TOL: + return 0.0 diff --git a/Dreamer/local_dm_control_suite/lqr.xml b/Dreamer/local_dm_control_suite/lqr.xml new file mode 100755 index 0000000..d403532 --- /dev/null +++ b/Dreamer/local_dm_control_suite/lqr.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/lqr_solver.py b/Dreamer/local_dm_control_suite/lqr_solver.py new file mode 100755 index 0000000..3935c7d --- /dev/null +++ b/Dreamer/local_dm_control_suite/lqr_solver.py @@ -0,0 +1,142 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +r"""Optimal policy for LQR levels. + +LQR control problem is described in +https://en.wikipedia.org/wiki/Linear-quadratic_regulator#Infinite-horizon.2C_discrete-time_LQR +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import logging +from dm_control.mujoco import wrapper +import numpy as np +from six.moves import range + +try: + import scipy.linalg as sp # pylint: disable=g-import-not-at-top +except ImportError: + sp = None + + +def _solve_dare(a, b, q, r): + """Solves the Discrete-time Algebraic Riccati Equation (DARE) by iteration. + + Algebraic Riccati Equation: + ```none + P_{t-1} = Q + A' * P_{t} * A - + A' * P_{t} * B * (R + B' * P_{t} * B)^{-1} * B' * P_{t} * A + ``` + + Args: + a: A 2 dimensional numpy array, transition matrix A. + b: A 2 dimensional numpy array, control matrix B. + q: A 2 dimensional numpy array, symmetric positive definite cost matrix. + r: A 2 dimensional numpy array, symmetric positive definite cost matrix + + Returns: + A numpy array, a real symmetric matrix P which is the solution to DARE. + + Raises: + RuntimeError: If the computed P matrix is not symmetric and + positive-definite. + """ + p = np.eye(len(a)) + for _ in range(1000000): + a_p = a.T.dot(p) # A' * P_t + a_p_b = np.dot(a_p, b) # A' * P_t * B + # Algebraic Riccati Equation. + p_next = q + np.dot(a_p, a) - a_p_b.dot( + np.linalg.solve(b.T.dot(p.dot(b)) + r, a_p_b.T)) + p_next += p_next.T + p_next *= .5 + if np.abs(p - p_next).max() < 1e-12: + break + p = p_next + else: + logging.warning('DARE solver did not converge') + try: + # Check that the result is symmetric and positive-definite. + np.linalg.cholesky(p_next) + except np.linalg.LinAlgError: + raise RuntimeError('ARE solver failed: P matrix is not symmetric and ' + 'positive-definite.') + return p_next + + +def solve(env): + """Returns the optimal value and policy for LQR problem. + + Args: + env: An instance of `control.EnvironmentV2` with LQR level. + + Returns: + p: A numpy array, the Hessian of the optimal total cost-to-go (value + function at state x) is V(x) = .5 * x' * p * x. + k: A numpy array which gives the optimal linear policy u = k * x. + beta: The maximum eigenvalue of (a + b * k). Under optimal policy, at + timestep n the state tends to 0 like beta^n. + + Raises: + RuntimeError: If the controlled system is unstable. + """ + n = env.physics.model.nq # number of DoFs + m = env.physics.model.nu # number of controls + + # Compute the mass matrix. + mass = np.zeros((n, n)) + wrapper.mjbindings.mjlib.mj_fullM(env.physics.model.ptr, mass, + env.physics.data.qM) + + # Compute input matrices a, b, q and r to the DARE solvers. + # State transition matrix a. 
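+  # Derivation sketch (editor's note, not in the original source): the passive
+  # dynamics are mass @ qacc = -(stiffness @ qpos + damping @ qvel), so with
+  # j = -mass^-1 @ [stiffness, damping] the continuous-time transition for the
+  # state [qpos; qvel] is [[0, I], [j]]. The matrix `a` built below is its
+  # one-step discretisation over `dt`, with qpos advanced using the updated
+  # velocity (a semi-implicit Euler scheme, hence the extra dt*j term).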
+  stiffness = np.diag(env.physics.model.jnt_stiffness.ravel())
+  damping = np.diag(env.physics.model.dof_damping.ravel())
+  dt = env.physics.model.opt.timestep
+
+  j = np.linalg.solve(-mass, np.hstack((stiffness, damping)))
+  a = np.eye(2 * n) + dt * np.vstack(
+      (dt * j + np.hstack((np.zeros((n, n)), np.eye(n))), j))
+
+  # Control transition matrix b.
+  b = env.physics.data.actuator_moment.T
+  bc = np.linalg.solve(mass, b)
+  b = dt * np.vstack((dt * bc, bc))
+
+  # State cost Hessian q.
+  q = np.diag(np.hstack([np.ones(n), np.zeros(n)]))
+
+  # Control cost Hessian r.
+  r = env.task.control_cost_coef * np.eye(m)
+
+  if sp:
+    # Use scipy's faster DARE solver if available.
+    solve_dare = sp.solve_discrete_are
+  else:
+    # Otherwise fall back on a slower internal implementation.
+    solve_dare = _solve_dare
+
+  # Solve the discrete algebraic Riccati equation.
+  p = solve_dare(a, b, q, r)
+  k = -np.linalg.solve(b.T.dot(p.dot(b)) + r, b.T.dot(p.dot(a)))
+
+  # Under optimal policy, state tends to 0 like beta^n_timesteps.
+  beta = np.abs(np.linalg.eigvals(a + b.dot(k))).max()
+  if beta >= 1.0:
+    raise RuntimeError('Controlled system is unstable.')
+  return p, k, beta
diff --git a/Dreamer/local_dm_control_suite/manipulator.py b/Dreamer/local_dm_control_suite/manipulator.py
new file mode 100755
index 0000000..b2ed31f
--- /dev/null
+++ b/Dreamer/local_dm_control_suite/manipulator.py
@@ -0,0 +1,290 @@
+# Copyright 2017 The dm_control Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Planar Manipulator domain."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from dm_control import mujoco
+from dm_control.rl import control
+from local_dm_control_suite import base
+from local_dm_control_suite import common
+from dm_control.utils import containers
+from dm_control.utils import rewards
+from dm_control.utils import xml_tools
+
+from lxml import etree
+import numpy as np
+
+_CLOSE = .01    # (Meters) Distance below which a thing is considered close.
+_CONTROL_TIMESTEP = .01  # (Seconds)
+_TIME_LIMIT = 10  # (Seconds)
+_P_IN_HAND = .1  # Probability of object-in-hand initial state
+_P_IN_TARGET = .1  # Probability of object-in-target initial state
+_ARM_JOINTS = ['arm_root', 'arm_shoulder', 'arm_elbow', 'arm_wrist',
+               'finger', 'fingertip', 'thumb', 'thumbtip']
+_ALL_PROPS = frozenset(['ball', 'target_ball', 'cup',
+                        'peg', 'target_peg', 'slot'])
+
+SUITE = containers.TaggedTasks()
+
+
+def make_model(use_peg, insert):
+  """Returns a tuple containing the model XML string and a dict of assets."""
+  xml_string = common.read_model('manipulator.xml')
+  parser = etree.XMLParser(remove_blank_text=True)
+  mjcf = etree.XML(xml_string, parser)
+
+  # Select the desired prop.
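+  # Props kept per task variant, for reference (all others are removed below):
+  #   bring_ball: ball, target_ball        insert_ball: ball, target_ball, cup
+  #   bring_peg:  peg, target_peg          insert_peg:  peg, target_peg, slot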
+ if use_peg: + required_props = ['peg', 'target_peg'] + if insert: + required_props += ['slot'] + else: + required_props = ['ball', 'target_ball'] + if insert: + required_props += ['cup'] + + # Remove unused props + for unused_prop in _ALL_PROPS.difference(required_props): + prop = xml_tools.find_element(mjcf, 'body', unused_prop) + prop.getparent().remove(prop) + + return etree.tostring(mjcf, pretty_print=True), common.ASSETS + + +@SUITE.add('benchmarking', 'hard') +def bring_ball(fully_observable=True, time_limit=_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns manipulator bring task with the ball prop.""" + use_peg = False + insert = False + physics = Physics.from_xml_string(*make_model(use_peg, insert)) + task = Bring(use_peg=use_peg, insert=insert, + fully_observable=fully_observable, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +@SUITE.add('hard') +def bring_peg(fully_observable=True, time_limit=_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns manipulator bring task with the peg prop.""" + use_peg = True + insert = False + physics = Physics.from_xml_string(*make_model(use_peg, insert)) + task = Bring(use_peg=use_peg, insert=insert, + fully_observable=fully_observable, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +@SUITE.add('hard') +def insert_ball(fully_observable=True, time_limit=_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns manipulator insert task with the ball prop.""" + use_peg = False + insert = True + physics = Physics.from_xml_string(*make_model(use_peg, insert)) + task = Bring(use_peg=use_peg, insert=insert, + fully_observable=fully_observable, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +@SUITE.add('hard') +def insert_peg(fully_observable=True, time_limit=_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns manipulator insert task with the peg prop.""" + use_peg = True + insert = True + physics = Physics.from_xml_string(*make_model(use_peg, insert)) + task = Bring(use_peg=use_peg, insert=insert, + fully_observable=fully_observable, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics with additional features for the Planar Manipulator domain.""" + + def bounded_joint_pos(self, joint_names): + """Returns joint positions as (sin, cos) values.""" + joint_pos = self.named.data.qpos[joint_names] + return np.vstack([np.sin(joint_pos), np.cos(joint_pos)]).T + + def joint_vel(self, joint_names): + """Returns joint velocities.""" + return self.named.data.qvel[joint_names] + + def body_2d_pose(self, body_names, orientation=True): + """Returns positions and/or orientations of bodies.""" + if not isinstance(body_names, str): + body_names = np.array(body_names).reshape(-1, 1) # Broadcast indices. 
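+    # Reshaping the names into a column vector lets the named indexing below
+    # broadcast against the ['x', 'z'] column labels, returning one planar
+    # row per body. E.g. body_2d_pose(['hand', 'ball']) yields a (2, 4) array
+    # of [x, z, qw, qy] rows, where (qw, qy) encodes rotation about the y-axis.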
+    pos = self.named.data.xpos[body_names, ['x', 'z']]
+    if orientation:
+      ori = self.named.data.xquat[body_names, ['qw', 'qy']]
+      return np.hstack([pos, ori])
+    else:
+      return pos
+
+  def touch(self):
+    return np.log1p(self.data.sensordata)
+
+  def site_distance(self, site1, site2):
+    site1_to_site2 = np.diff(self.named.data.site_xpos[[site2, site1]], axis=0)
+    return np.linalg.norm(site1_to_site2)
+
+
+class Bring(base.Task):
+  """A Bring `Task`: bring the prop to the target."""
+
+  def __init__(self, use_peg, insert, fully_observable, random=None):
+    """Initialize an instance of the `Bring` task.
+
+    Args:
+      use_peg: A `bool`, whether to replace the ball prop with the peg prop.
+      insert: A `bool`, whether to insert the prop in a receptacle.
+      fully_observable: A `bool`, whether the observation should contain the
+        position and velocity of the object being manipulated and the target
+        location.
+      random: Optional, either a `numpy.random.RandomState` instance, an
+        integer seed for creating a new `RandomState`, or None to select a seed
+        automatically (default).
+    """
+    self._use_peg = use_peg
+    self._target = 'target_peg' if use_peg else 'target_ball'
+    self._object = 'peg' if self._use_peg else 'ball'
+    self._object_joints = ['_'.join([self._object, dim]) for dim in 'xzy']
+    self._receptacle = 'slot' if self._use_peg else 'cup'
+    self._insert = insert
+    self._fully_observable = fully_observable
+    super(Bring, self).__init__(random=random)
+
+  def initialize_episode(self, physics):
+    """Sets the state of the environment at the start of each episode."""
+    # Local aliases.
+    choice = self.random.choice
+    uniform = self.random.uniform
+    model = physics.named.model
+    data = physics.named.data
+
+    # Find a collision-free random initial configuration.
+    penetrating = True
+    while penetrating:
+
+      # Randomise angles of arm joints.
+      is_limited = model.jnt_limited[_ARM_JOINTS].astype(bool)
+      joint_range = model.jnt_range[_ARM_JOINTS]
+      lower_limits = np.where(is_limited, joint_range[:, 0], -np.pi)
+      upper_limits = np.where(is_limited, joint_range[:, 1], np.pi)
+      angles = uniform(lower_limits, upper_limits)
+      data.qpos[_ARM_JOINTS] = angles
+
+      # Symmetrize hand.
+      data.qpos['finger'] = data.qpos['thumb']
+
+      # Randomise target location.
+      target_x = uniform(-.4, .4)
+      target_z = uniform(.1, .4)
+      if self._insert:
+        target_angle = uniform(-np.pi/3, np.pi/3)
+        model.body_pos[self._receptacle, ['x', 'z']] = target_x, target_z
+        model.body_quat[self._receptacle, ['qw', 'qy']] = [
+            np.cos(target_angle/2), np.sin(target_angle/2)]
+      else:
+        target_angle = uniform(-np.pi, np.pi)
+
+      model.body_pos[self._target, ['x', 'z']] = target_x, target_z
+      model.body_quat[self._target, ['qw', 'qy']] = [
+          np.cos(target_angle/2), np.sin(target_angle/2)]
+
+      # Randomise object location.
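+      # The object starts in the gripper with probability _P_IN_HAND, on the
+      # target with probability _P_IN_TARGET, and otherwise uniformly at
+      # random in the arena, as sampled below.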
+ object_init_probs = [_P_IN_HAND, _P_IN_TARGET, 1-_P_IN_HAND-_P_IN_TARGET] + init_type = choice(['in_hand', 'in_target', 'uniform'], + p=object_init_probs) + if init_type == 'in_target': + object_x = target_x + object_z = target_z + object_angle = target_angle + elif init_type == 'in_hand': + physics.after_reset() + object_x = data.site_xpos['grasp', 'x'] + object_z = data.site_xpos['grasp', 'z'] + grasp_direction = data.site_xmat['grasp', ['xx', 'zx']] + object_angle = np.pi-np.arctan2(grasp_direction[1], grasp_direction[0]) + else: + object_x = uniform(-.5, .5) + object_z = uniform(0, .7) + object_angle = uniform(0, 2*np.pi) + data.qvel[self._object + '_x'] = uniform(-5, 5) + + data.qpos[self._object_joints] = object_x, object_z, object_angle + + # Check for collisions. + physics.after_reset() + penetrating = physics.data.ncon > 0 + + super(Bring, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns either features or only sensors (to be used with pixels).""" + obs = collections.OrderedDict() + obs['arm_pos'] = physics.bounded_joint_pos(_ARM_JOINTS) + obs['arm_vel'] = physics.joint_vel(_ARM_JOINTS) + obs['touch'] = physics.touch() + if self._fully_observable: + obs['hand_pos'] = physics.body_2d_pose('hand') + obs['object_pos'] = physics.body_2d_pose(self._object) + obs['object_vel'] = physics.joint_vel(self._object_joints) + obs['target_pos'] = physics.body_2d_pose(self._target) + return obs + + def _is_close(self, distance): + return rewards.tolerance(distance, (0, _CLOSE), _CLOSE*2) + + def _peg_reward(self, physics): + """Returns a reward for bringing the peg prop to the target.""" + grasp = self._is_close(physics.site_distance('peg_grasp', 'grasp')) + pinch = self._is_close(physics.site_distance('peg_pinch', 'pinch')) + grasping = (grasp + pinch) / 2 + bring = self._is_close(physics.site_distance('peg', 'target_peg')) + bring_tip = self._is_close(physics.site_distance('target_peg_tip', + 'peg_tip')) + bringing = (bring + bring_tip) / 2 + return max(bringing, grasping/3) + + def _ball_reward(self, physics): + """Returns a reward for bringing the ball prop to the target.""" + return self._is_close(physics.site_distance('ball', 'target_ball')) + + def get_reward(self, physics): + """Returns a reward to the agent.""" + if self._use_peg: + return self._peg_reward(physics) + else: + return self._ball_reward(physics) diff --git a/Dreamer/local_dm_control_suite/manipulator.xml b/Dreamer/local_dm_control_suite/manipulator.xml new file mode 100755 index 0000000..d6d1767 --- /dev/null +++ b/Dreamer/local_dm_control_suite/manipulator.xml @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + > + + diff --git a/Dreamer/local_dm_control_suite/pendulum.py b/Dreamer/local_dm_control_suite/pendulum.py new file mode 100755 index 0000000..38f442b --- /dev/null +++ b/Dreamer/local_dm_control_suite/pendulum.py @@ -0,0 +1,114 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Pendulum domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers +from dm_control.utils import rewards +import numpy as np + + +_DEFAULT_TIME_LIMIT = 20 +_ANGLE_BOUND = 8 +_COSINE_BOUND = np.cos(np.deg2rad(_ANGLE_BOUND)) +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('pendulum.xml'), common.ASSETS + + +@SUITE.add('benchmarking') +def swingup(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns pendulum swingup task .""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = SwingUp(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Pendulum domain.""" + + def pole_vertical(self): + """Returns vertical (z) component of pole frame.""" + return self.named.data.xmat['pole', 'zz'] + + def angular_velocity(self): + """Returns the angular velocity of the pole.""" + return self.named.data.qvel['hinge'].copy() + + def pole_orientation(self): + """Returns both horizontal and vertical components of pole frame.""" + return self.named.data.xmat['pole', ['zz', 'xz']] + + +class SwingUp(base.Task): + """A Pendulum `Task` to swing up and balance the pole.""" + + def __init__(self, random=None): + """Initialize an instance of `Pendulum`. + + Args: + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + super(SwingUp, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Pole is set to a random angle between [-pi, pi). + + Args: + physics: An instance of `Physics`. + + """ + physics.named.data.qpos['hinge'] = self.random.uniform(-np.pi, np.pi) + super(SwingUp, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation. + + Observations are states concatenating pole orientation and angular velocity + and pixels from fixed camera. + + Args: + physics: An instance of `physics`, Pendulum physics. + + Returns: + A `dict` of observation. + """ + obs = collections.OrderedDict() + obs['orientation'] = physics.pole_orientation() + obs['velocity'] = physics.angular_velocity() + return obs + + def get_reward(self, physics): + return rewards.tolerance(physics.pole_vertical(), (_COSINE_BOUND, 1)) diff --git a/Dreamer/local_dm_control_suite/pendulum.xml b/Dreamer/local_dm_control_suite/pendulum.xml new file mode 100755 index 0000000..14377ae --- /dev/null +++ b/Dreamer/local_dm_control_suite/pendulum.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/point_mass.py b/Dreamer/local_dm_control_suite/point_mass.py new file mode 100755 index 0000000..b45ba17 --- /dev/null +++ b/Dreamer/local_dm_control_suite/point_mass.py @@ -0,0 +1,130 @@ +# Copyright 2017 The dm_control Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Point-mass domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.suite.utils import randomizers +from dm_control.utils import containers +from dm_control.utils import rewards +import numpy as np + +_DEFAULT_TIME_LIMIT = 20 +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('point_mass.xml'), common.ASSETS + + +@SUITE.add('benchmarking', 'easy') +def easy(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the easy point_mass task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = PointMass(randomize_gains=False, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, **environment_kwargs) + + +@SUITE.add() +def hard(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the hard point_mass task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = PointMass(randomize_gains=True, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, **environment_kwargs) + + +class Physics(mujoco.Physics): + """physics for the point_mass domain.""" + + def mass_to_target(self): + """Returns the vector from mass to target in global coordinate.""" + return (self.named.data.geom_xpos['target'] - + self.named.data.geom_xpos['pointmass']) + + def mass_to_target_dist(self): + """Returns the distance from mass to the target.""" + return np.linalg.norm(self.mass_to_target()) + + +class PointMass(base.Task): + """A point_mass `Task` to reach target with smooth reward.""" + + def __init__(self, randomize_gains, random=None): + """Initialize an instance of `PointMass`. + + Args: + randomize_gains: A `bool`, whether to randomize the actuator gains. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._randomize_gains = randomize_gains + super(PointMass, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + If _randomize_gains is True, the relationship between the controls and + the joints is randomized, so that each control actuates a random linear + combination of joints. + + Args: + physics: An instance of `mujoco.Physics`. 
+ """ + randomizers.randomize_limited_and_rotational_joints(physics, self.random) + if self._randomize_gains: + dir1 = self.random.randn(2) + dir1 /= np.linalg.norm(dir1) + # Find another actuation direction that is not 'too parallel' to dir1. + parallel = True + while parallel: + dir2 = self.random.randn(2) + dir2 /= np.linalg.norm(dir2) + parallel = abs(np.dot(dir1, dir2)) > 0.9 + physics.model.wrap_prm[[0, 1]] = dir1 + physics.model.wrap_prm[[2, 3]] = dir2 + super(PointMass, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of the state.""" + obs = collections.OrderedDict() + obs['position'] = physics.position() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + target_size = physics.named.model.geom_size['target', 0] + near_target = rewards.tolerance(physics.mass_to_target_dist(), + bounds=(0, target_size), margin=target_size) + control_reward = rewards.tolerance(physics.control(), margin=1, + value_at_margin=0, + sigmoid='quadratic').mean() + small_control = (control_reward + 4) / 5 + return near_target * small_control diff --git a/Dreamer/local_dm_control_suite/point_mass.xml b/Dreamer/local_dm_control_suite/point_mass.xml new file mode 100755 index 0000000..c447cf6 --- /dev/null +++ b/Dreamer/local_dm_control_suite/point_mass.xml @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/quadruped.py b/Dreamer/local_dm_control_suite/quadruped.py new file mode 100755 index 0000000..9e326d7 --- /dev/null +++ b/Dreamer/local_dm_control_suite/quadruped.py @@ -0,0 +1,480 @@ +# Copyright 2019 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Quadruped Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.mujoco.wrapper import mjbindings +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers +from dm_control.utils import rewards +from dm_control.utils import xml_tools + +from lxml import etree +import numpy as np +from scipy import ndimage + +enums = mjbindings.enums +mjlib = mjbindings.mjlib + + +_DEFAULT_TIME_LIMIT = 20 +_CONTROL_TIMESTEP = .02 + +# Horizontal speeds above which the move reward is 1. +_RUN_SPEED = 5 +_WALK_SPEED = 0.5 + +# Constants related to terrain generation. +_HEIGHTFIELD_ID = 0 +_TERRAIN_SMOOTHNESS = 0.15 # 0.0: maximally bumpy; 1.0: completely smooth. +_TERRAIN_BUMP_SCALE = 2 # Spatial scale of terrain bumps (in meters). + +# Named model elements. 
+_TOES = ['toe_front_left', 'toe_back_left', 'toe_back_right', 'toe_front_right'] +_WALLS = ['wall_px', 'wall_py', 'wall_nx', 'wall_ny'] + +SUITE = containers.TaggedTasks() + + +def make_model(floor_size=None, terrain=False, rangefinders=False, + walls_and_ball=False): + """Returns the model XML string.""" + xml_string = common.read_model('quadruped.xml') + parser = etree.XMLParser(remove_blank_text=True) + mjcf = etree.XML(xml_string, parser) + + # Set floor size. + if floor_size is not None: + floor_geom = mjcf.find('.//geom[@name={!r}]'.format('floor')) + floor_geom.attrib['size'] = '{} {} .5'.format(floor_size, floor_size) + + # Remove walls, ball and target. + if not walls_and_ball: + for wall in _WALLS: + wall_geom = xml_tools.find_element(mjcf, 'geom', wall) + wall_geom.getparent().remove(wall_geom) + + # Remove ball. + ball_body = xml_tools.find_element(mjcf, 'body', 'ball') + ball_body.getparent().remove(ball_body) + + # Remove target. + target_site = xml_tools.find_element(mjcf, 'site', 'target') + target_site.getparent().remove(target_site) + + # Remove terrain. + if not terrain: + terrain_geom = xml_tools.find_element(mjcf, 'geom', 'terrain') + terrain_geom.getparent().remove(terrain_geom) + + # Remove rangefinders if they're not used, as range computations can be + # expensive, especially in a scene with heightfields. + if not rangefinders: + rangefinder_sensors = mjcf.findall('.//rangefinder') + for rf in rangefinder_sensors: + rf.getparent().remove(rf) + + return etree.tostring(mjcf, pretty_print=True) + + +@SUITE.add() +def walk(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Walk task.""" + xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED) + physics = Physics.from_xml_string(xml_string, common.ASSETS) + task = Move(desired_speed=_WALK_SPEED, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment(physics, task, time_limit=time_limit, + control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add() +def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Run task.""" + xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _RUN_SPEED) + physics = Physics.from_xml_string(xml_string, common.ASSETS) + task = Move(desired_speed=_RUN_SPEED, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment(physics, task, time_limit=time_limit, + control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add() +def escape(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns the Escape task.""" + xml_string = make_model(floor_size=40, terrain=True, rangefinders=True) + physics = Physics.from_xml_string(xml_string, common.ASSETS) + task = Escape(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment(physics, task, time_limit=time_limit, + control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add() +def fetch(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Fetch task.""" + xml_string = make_model(walls_and_ball=True) + physics = Physics.from_xml_string(xml_string, common.ASSETS) + task = Fetch(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment(physics, task, time_limit=time_limit, + control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for 
the Quadruped domain.""" + + def _reload_from_data(self, data): + super(Physics, self)._reload_from_data(data) + # Clear cached sensor names when the physics is reloaded. + self._sensor_types_to_names = {} + self._hinge_names = [] + + def _get_sensor_names(self, *sensor_types): + try: + sensor_names = self._sensor_types_to_names[sensor_types] + except KeyError: + [sensor_ids] = np.where(np.in1d(self.model.sensor_type, sensor_types)) + sensor_names = [self.model.id2name(s_id, 'sensor') for s_id in sensor_ids] + self._sensor_types_to_names[sensor_types] = sensor_names + return sensor_names + + def torso_upright(self): + """Returns the dot-product of the torso z-axis and the global z-axis.""" + return np.asarray(self.named.data.xmat['torso', 'zz']) + + def torso_velocity(self): + """Returns the velocity of the torso, in the local frame.""" + return self.named.data.sensordata['velocimeter'].copy() + + def egocentric_state(self): + """Returns the state without global orientation or position.""" + if not self._hinge_names: + [hinge_ids] = np.nonzero(self.model.jnt_type == + enums.mjtJoint.mjJNT_HINGE) + self._hinge_names = [self.model.id2name(j_id, 'joint') + for j_id in hinge_ids] + return np.hstack((self.named.data.qpos[self._hinge_names], + self.named.data.qvel[self._hinge_names], + self.data.act)) + + def toe_positions(self): + """Returns toe positions in egocentric frame.""" + torso_frame = self.named.data.xmat['torso'].reshape(3, 3) + torso_pos = self.named.data.xpos['torso'] + torso_to_toe = self.named.data.xpos[_TOES] - torso_pos + return torso_to_toe.dot(torso_frame) + + def force_torque(self): + """Returns scaled force/torque sensor readings at the toes.""" + force_torque_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_FORCE, + enums.mjtSensor.mjSENS_TORQUE) + return np.arcsinh(self.named.data.sensordata[force_torque_sensors]) + + def imu(self): + """Returns IMU-like sensor readings.""" + imu_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_GYRO, + enums.mjtSensor.mjSENS_ACCELEROMETER) + return self.named.data.sensordata[imu_sensors] + + def rangefinder(self): + """Returns scaled rangefinder sensor readings.""" + rf_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_RANGEFINDER) + rf_readings = self.named.data.sensordata[rf_sensors] + no_intersection = -1.0 + return np.where(rf_readings == no_intersection, 1.0, np.tanh(rf_readings)) + + def origin_distance(self): + """Returns the distance from the origin to the workspace.""" + return np.asarray(np.linalg.norm(self.named.data.site_xpos['workspace'])) + + def origin(self): + """Returns origin position in the torso frame.""" + torso_frame = self.named.data.xmat['torso'].reshape(3, 3) + torso_pos = self.named.data.xpos['torso'] + return -torso_pos.dot(torso_frame) + + def ball_state(self): + """Returns ball position and velocity relative to the torso frame.""" + data = self.named.data + torso_frame = data.xmat['torso'].reshape(3, 3) + ball_rel_pos = data.xpos['ball'] - data.xpos['torso'] + ball_rel_vel = data.qvel['ball_root'][:3] - data.qvel['root'][:3] + ball_rot_vel = data.qvel['ball_root'][3:] + ball_state = np.vstack((ball_rel_pos, ball_rel_vel, ball_rot_vel)) + return ball_state.dot(torso_frame).ravel() + + def target_position(self): + """Returns target position in torso frame.""" + torso_frame = self.named.data.xmat['torso'].reshape(3, 3) + torso_pos = self.named.data.xpos['torso'] + torso_to_target = self.named.data.site_xpos['target'] - torso_pos + return torso_to_target.dot(torso_frame) + + def 
ball_to_target_distance(self): + """Returns horizontal distance from the ball to the target.""" + ball_to_target = (self.named.data.site_xpos['target'] - + self.named.data.xpos['ball']) + return np.linalg.norm(ball_to_target[:2]) + + def self_to_ball_distance(self): + """Returns horizontal distance from the quadruped workspace to the ball.""" + self_to_ball = (self.named.data.site_xpos['workspace'] + -self.named.data.xpos['ball']) + return np.linalg.norm(self_to_ball[:2]) + + +def _find_non_contacting_height(physics, orientation, x_pos=0.0, y_pos=0.0): + """Find a height with no contacts given a body orientation. + + Args: + physics: An instance of `Physics`. + orientation: A quaternion. + x_pos: A float. Position along global x-axis. + y_pos: A float. Position along global y-axis. + Raises: + RuntimeError: If a non-contacting configuration has not been found after + 10,000 attempts. + """ + z_pos = 0.0 # Start embedded in the floor. + num_contacts = 1 + num_attempts = 0 + # Move up in 1cm increments until no contacts. + while num_contacts > 0: + try: + with physics.reset_context(): + physics.named.data.qpos['root'][:3] = x_pos, y_pos, z_pos + physics.named.data.qpos['root'][3:] = orientation + except control.PhysicsError: + # We may encounter a PhysicsError here due to filling the contact + # buffer, in which case we simply increment the height and continue. + pass + num_contacts = physics.data.ncon + z_pos += 0.01 + num_attempts += 1 + if num_attempts > 10000: + raise RuntimeError('Failed to find a non-contacting configuration.') + + +def _common_observations(physics): + """Returns the observations common to all tasks.""" + obs = collections.OrderedDict() + obs['egocentric_state'] = physics.egocentric_state() + obs['torso_velocity'] = physics.torso_velocity() + obs['torso_upright'] = physics.torso_upright() + obs['imu'] = physics.imu() + obs['force_torque'] = physics.force_torque() + return obs + + +def _upright_reward(physics, deviation_angle=0): + """Returns a reward proportional to how upright the torso is. + + Args: + physics: an instance of `Physics`. + deviation_angle: A float, in degrees. The reward is 0 when the torso is + exactly upside-down and 1 when the torso's z-axis is less than + `deviation_angle` away from the global z-axis. + """ + deviation = np.cos(np.deg2rad(deviation_angle)) + return rewards.tolerance( + physics.torso_upright(), + bounds=(deviation, float('inf')), + sigmoid='linear', + margin=1 + deviation, + value_at_margin=0) + + +class Move(base.Task): + """A quadruped task solved by moving forward at a designated speed.""" + + def __init__(self, desired_speed, random=None): + """Initializes an instance of `Move`. + + Args: + desired_speed: A float. If this value is zero, reward is given simply + for standing upright. Otherwise this specifies the horizontal velocity + at which the velocity-dependent reward component is maximized. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._desired_speed = desired_speed + super(Move, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Args: + physics: An instance of `Physics`. + + """ + # Initial configuration. 
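+    # A normalised 4-vector of i.i.d. Gaussian samples is uniformly
+    # distributed on the unit 3-sphere, so this yields a uniformly random
+    # initial orientation quaternion.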
+ orientation = self.random.randn(4) + orientation /= np.linalg.norm(orientation) + _find_non_contacting_height(physics, orientation) + super(Move, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation to the agent.""" + return _common_observations(physics) + + def get_reward(self, physics): + """Returns a reward to the agent.""" + + # Move reward term. + move_reward = rewards.tolerance( + physics.torso_velocity()[0], + bounds=(self._desired_speed, float('inf')), + margin=self._desired_speed, + value_at_margin=0.5, + sigmoid='linear') + + return _upright_reward(physics) * move_reward + + +class Escape(base.Task): + """A quadruped task solved by escaping a bowl-shaped terrain.""" + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Args: + physics: An instance of `Physics`. + + """ + # Get heightfield resolution, assert that it is square. + res = physics.model.hfield_nrow[_HEIGHTFIELD_ID] + assert res == physics.model.hfield_ncol[_HEIGHTFIELD_ID] + # Sinusoidal bowl shape. + row_grid, col_grid = np.ogrid[-1:1:res*1j, -1:1:res*1j] + radius = np.clip(np.sqrt(col_grid**2 + row_grid**2), .04, 1) + bowl_shape = .5 - np.cos(2*np.pi*radius)/2 + # Random smooth bumps. + terrain_size = 2 * physics.model.hfield_size[_HEIGHTFIELD_ID, 0] + bump_res = int(terrain_size / _TERRAIN_BUMP_SCALE) + bumps = self.random.uniform(_TERRAIN_SMOOTHNESS, 1, (bump_res, bump_res)) + smooth_bumps = ndimage.zoom(bumps, res / float(bump_res)) + # Terrain is elementwise product. + terrain = bowl_shape * smooth_bumps + start_idx = physics.model.hfield_adr[_HEIGHTFIELD_ID] + physics.model.hfield_data[start_idx:start_idx+res**2] = terrain.ravel() + super(Escape, self).initialize_episode(physics) + + # If we have a rendering context, we need to re-upload the modified + # heightfield data. + if physics.contexts: + with physics.contexts.gl.make_current() as ctx: + ctx.call(mjlib.mjr_uploadHField, + physics.model.ptr, + physics.contexts.mujoco.ptr, + _HEIGHTFIELD_ID) + + # Initial configuration. + orientation = self.random.randn(4) + orientation /= np.linalg.norm(orientation) + _find_non_contacting_height(physics, orientation) + + def get_observation(self, physics): + """Returns an observation to the agent.""" + obs = _common_observations(physics) + obs['origin'] = physics.origin() + obs['rangefinder'] = physics.rangefinder() + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + + # Escape reward term. + terrain_size = physics.model.hfield_size[_HEIGHTFIELD_ID, 0] + escape_reward = rewards.tolerance( + physics.origin_distance(), + bounds=(terrain_size, float('inf')), + margin=terrain_size, + value_at_margin=0, + sigmoid='linear') + + return _upright_reward(physics, deviation_angle=20) * escape_reward + + +class Fetch(base.Task): + """A quadruped task solved by bringing a ball to the origin.""" + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + Args: + physics: An instance of `Physics`. + + """ + # Initial configuration, random azimuth and horizontal position. + azimuth = self.random.uniform(0, 2*np.pi) + orientation = np.array((np.cos(azimuth/2), 0, 0, np.sin(azimuth/2))) + spawn_radius = 0.9 * physics.named.model.geom_size['floor', 0] + x_pos, y_pos = self.random.uniform(-spawn_radius, spawn_radius, size=(2,)) + _find_non_contacting_height(physics, orientation, x_pos, y_pos) + + # Initial ball state. 
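+    # The ball is dropped from a height of 2m at a random horizontal position
+    # inside the spawn radius, with Gaussian horizontal velocity (std. 5 m/s
+    # per axis).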
+ physics.named.data.qpos['ball_root'][:2] = self.random.uniform( + -spawn_radius, spawn_radius, size=(2,)) + physics.named.data.qpos['ball_root'][2] = 2 + physics.named.data.qvel['ball_root'][:2] = 5*self.random.randn(2) + super(Fetch, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation to the agent.""" + obs = _common_observations(physics) + obs['ball_state'] = physics.ball_state() + obs['target_position'] = physics.target_position() + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + + # Reward for moving close to the ball. + arena_radius = physics.named.model.geom_size['floor', 0] * np.sqrt(2) + workspace_radius = physics.named.model.site_size['workspace', 0] + ball_radius = physics.named.model.geom_size['ball', 0] + reach_reward = rewards.tolerance( + physics.self_to_ball_distance(), + bounds=(0, workspace_radius+ball_radius), + sigmoid='linear', + margin=arena_radius, value_at_margin=0) + + # Reward for bringing the ball to the target. + target_radius = physics.named.model.site_size['target', 0] + fetch_reward = rewards.tolerance( + physics.ball_to_target_distance(), + bounds=(0, target_radius), + sigmoid='linear', + margin=arena_radius, value_at_margin=0) + + reach_then_fetch = reach_reward * (0.5 + 0.5*fetch_reward) + + return _upright_reward(physics) * reach_then_fetch diff --git a/Dreamer/local_dm_control_suite/quadruped.xml b/Dreamer/local_dm_control_suite/quadruped.xml new file mode 100755 index 0000000..958d2c0 --- /dev/null +++ b/Dreamer/local_dm_control_suite/quadruped.xml @@ -0,0 +1,329 @@ + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/reacher.py b/Dreamer/local_dm_control_suite/reacher.py new file mode 100755 index 0000000..feea8b4 --- /dev/null +++ b/Dreamer/local_dm_control_suite/reacher.py @@ -0,0 +1,116 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+"""Reacher domain."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from dm_control import mujoco
+from dm_control.rl import control
+from local_dm_control_suite import base
+from local_dm_control_suite import common
+from dm_control.suite.utils import randomizers
+from dm_control.utils import containers
+from dm_control.utils import rewards
+import numpy as np
+
+SUITE = containers.TaggedTasks()
+_DEFAULT_TIME_LIMIT = 20
+_BIG_TARGET = .05
+_SMALL_TARGET = .015
+
+
+def get_model_and_assets():
+  """Returns a tuple containing the model XML string and a dict of assets."""
+  return common.read_model('reacher.xml'), common.ASSETS
+
+
+@SUITE.add('benchmarking', 'easy')
+def easy(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
+  """Returns reacher with sparse reward with 5e-2 tol and randomized target."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Reacher(target_size=_BIG_TARGET, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+@SUITE.add('benchmarking')
+def hard(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
+  """Returns reacher with sparse reward with 1.5e-2 tol and randomized target."""
+  physics = Physics.from_xml_string(*get_model_and_assets())
+  task = Reacher(target_size=_SMALL_TARGET, random=random)
+  environment_kwargs = environment_kwargs or {}
+  return control.Environment(
+      physics, task, time_limit=time_limit, **environment_kwargs)
+
+
+class Physics(mujoco.Physics):
+  """Physics simulation with additional features for the Reacher domain."""
+
+  def finger_to_target(self):
+    """Returns the vector from the finger to the target in global coordinates."""
+    return (self.named.data.geom_xpos['target', :2] -
+            self.named.data.geom_xpos['finger', :2])
+
+  def finger_to_target_dist(self):
+    """Returns the distance between the finger and the target."""
+    return np.linalg.norm(self.finger_to_target())
+
+
+class Reacher(base.Task):
+  """A reacher `Task` to reach the target."""
+
+  def __init__(self, target_size, random=None):
+    """Initializes an instance of `Reacher`.
+
+    Args:
+      target_size: A `float`, tolerance to determine whether finger reached the
+        target.
+      random: Optional, either a `numpy.random.RandomState` instance, an
+        integer seed for creating a new `RandomState`, or None to select a seed
+        automatically (default).
+ """ + self._target_size = target_size + super(Reacher, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode.""" + physics.named.model.geom_size['target', 0] = self._target_size + randomizers.randomize_limited_and_rotational_joints(physics, self.random) + + # Randomize target position + angle = self.random.uniform(0, 2 * np.pi) + radius = self.random.uniform(.05, .20) + physics.named.model.geom_pos['target', 'x'] = radius * np.sin(angle) + physics.named.model.geom_pos['target', 'y'] = radius * np.cos(angle) + + super(Reacher, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of the state and the target position.""" + obs = collections.OrderedDict() + obs['position'] = physics.position() + obs['to_target'] = physics.finger_to_target() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + radii = physics.named.model.geom_size[['target', 'finger'], 0].sum() + return rewards.tolerance(physics.finger_to_target_dist(), (0, radii)) diff --git a/Dreamer/local_dm_control_suite/reacher.xml b/Dreamer/local_dm_control_suite/reacher.xml new file mode 100755 index 0000000..343f799 --- /dev/null +++ b/Dreamer/local_dm_control_suite/reacher.xml @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/stacker.py b/Dreamer/local_dm_control_suite/stacker.py new file mode 100755 index 0000000..6d4d49c --- /dev/null +++ b/Dreamer/local_dm_control_suite/stacker.py @@ -0,0 +1,208 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Planar Stacker domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.utils import containers +from dm_control.utils import rewards +from dm_control.utils import xml_tools + +from lxml import etree +import numpy as np + + +_CLOSE = .01 # (Meters) Distance below which a thing is considered close. 
+_CONTROL_TIMESTEP = .01 # (Seconds) +_TIME_LIMIT = 10 # (Seconds) +_ARM_JOINTS = ['arm_root', 'arm_shoulder', 'arm_elbow', 'arm_wrist', + 'finger', 'fingertip', 'thumb', 'thumbtip'] + +SUITE = containers.TaggedTasks() + + +def make_model(n_boxes): + """Returns a tuple containing the model XML string and a dict of assets.""" + xml_string = common.read_model('stacker.xml') + parser = etree.XMLParser(remove_blank_text=True) + mjcf = etree.XML(xml_string, parser) + + # Remove unused boxes + for b in range(n_boxes, 4): + box = xml_tools.find_element(mjcf, 'body', 'box' + str(b)) + box.getparent().remove(box) + + return etree.tostring(mjcf, pretty_print=True), common.ASSETS + + +@SUITE.add('hard') +def stack_2(fully_observable=True, time_limit=_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns stacker task with 2 boxes.""" + n_boxes = 2 + physics = Physics.from_xml_string(*make_model(n_boxes=n_boxes)) + task = Stack(n_boxes=n_boxes, + fully_observable=fully_observable, + random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +@SUITE.add('hard') +def stack_4(fully_observable=True, time_limit=_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns stacker task with 4 boxes.""" + n_boxes = 4 + physics = Physics.from_xml_string(*make_model(n_boxes=n_boxes)) + task = Stack(n_boxes=n_boxes, + fully_observable=fully_observable, + random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics with additional features for the Planar Manipulator domain.""" + + def bounded_joint_pos(self, joint_names): + """Returns joint positions as (sin, cos) values.""" + joint_pos = self.named.data.qpos[joint_names] + return np.vstack([np.sin(joint_pos), np.cos(joint_pos)]).T + + def joint_vel(self, joint_names): + """Returns joint velocities.""" + return self.named.data.qvel[joint_names] + + def body_2d_pose(self, body_names, orientation=True): + """Returns positions and/or orientations of bodies.""" + if not isinstance(body_names, str): + body_names = np.array(body_names).reshape(-1, 1) # Broadcast indices. + pos = self.named.data.xpos[body_names, ['x', 'z']] + if orientation: + ori = self.named.data.xquat[body_names, ['qw', 'qy']] + return np.hstack([pos, ori]) + else: + return pos + + def touch(self): + return np.log1p(self.data.sensordata) + + def site_distance(self, site1, site2): + site1_to_site2 = np.diff(self.named.data.site_xpos[[site2, site1]], axis=0) + return np.linalg.norm(site1_to_site2) + + +class Stack(base.Task): + """A Stack `Task`: stack the boxes.""" + + def __init__(self, n_boxes, fully_observable, random=None): + """Initialize an instance of the `Stack` task. + + Args: + n_boxes: An `int`, number of boxes to stack. + fully_observable: A `bool`, whether the observation should contain the + positions and velocities of the boxes and the location of the target. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). 
+ """ + self._n_boxes = n_boxes + self._box_names = ['box' + str(b) for b in range(n_boxes)] + self._box_joint_names = [] + for name in self._box_names: + for dim in 'xyz': + self._box_joint_names.append('_'.join([name, dim])) + self._fully_observable = fully_observable + super(Stack, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode.""" + # Local aliases + randint = self.random.randint + uniform = self.random.uniform + model = physics.named.model + data = physics.named.data + + # Find a collision-free random initial configuration. + penetrating = True + while penetrating: + + # Randomise angles of arm joints. + is_limited = model.jnt_limited[_ARM_JOINTS].astype(np.bool) + joint_range = model.jnt_range[_ARM_JOINTS] + lower_limits = np.where(is_limited, joint_range[:, 0], -np.pi) + upper_limits = np.where(is_limited, joint_range[:, 1], np.pi) + angles = uniform(lower_limits, upper_limits) + data.qpos[_ARM_JOINTS] = angles + + # Symmetrize hand. + data.qpos['finger'] = data.qpos['thumb'] + + # Randomise target location. + target_height = 2*randint(self._n_boxes) + 1 + box_size = model.geom_size['target', 0] + model.body_pos['target', 'z'] = box_size * target_height + model.body_pos['target', 'x'] = uniform(-.37, .37) + + # Randomise box locations. + for name in self._box_names: + data.qpos[name + '_x'] = uniform(.1, .3) + data.qpos[name + '_z'] = uniform(0, .7) + data.qpos[name + '_y'] = uniform(0, 2*np.pi) + + # Check for collisions. + physics.after_reset() + penetrating = physics.data.ncon > 0 + + super(Stack, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns either features or only sensors (to be used with pixels).""" + obs = collections.OrderedDict() + obs['arm_pos'] = physics.bounded_joint_pos(_ARM_JOINTS) + obs['arm_vel'] = physics.joint_vel(_ARM_JOINTS) + obs['touch'] = physics.touch() + if self._fully_observable: + obs['hand_pos'] = physics.body_2d_pose('hand') + obs['box_pos'] = physics.body_2d_pose(self._box_names) + obs['box_vel'] = physics.joint_vel(self._box_joint_names) + obs['target_pos'] = physics.body_2d_pose('target', orientation=False) + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + box_size = physics.named.model.geom_size['target', 0] + min_box_to_target_distance = min(physics.site_distance(name, 'target') + for name in self._box_names) + box_is_close = rewards.tolerance(min_box_to_target_distance, + margin=2*box_size) + hand_to_target_distance = physics.site_distance('grasp', 'target') + hand_is_far = rewards.tolerance(hand_to_target_distance, + bounds=(.1, float('inf')), + margin=_CLOSE) + return box_is_close * hand_is_far diff --git a/Dreamer/local_dm_control_suite/stacker.xml b/Dreamer/local_dm_control_suite/stacker.xml new file mode 100755 index 0000000..7af4877 --- /dev/null +++ b/Dreamer/local_dm_control_suite/stacker.xml @@ -0,0 +1,193 @@ + + + + + + + + + + + + + + > + + diff --git a/Dreamer/local_dm_control_suite/swimmer.py b/Dreamer/local_dm_control_suite/swimmer.py new file mode 100755 index 0000000..96fd8ea --- /dev/null +++ b/Dreamer/local_dm_control_suite/swimmer.py @@ -0,0 +1,215 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Procedurally generated Swimmer domain.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.suite.utils import randomizers +from dm_control.utils import containers +from dm_control.utils import rewards +from lxml import etree +import numpy as np +from six.moves import range + +_DEFAULT_TIME_LIMIT = 30 +_CONTROL_TIMESTEP = .03 # (Seconds) + +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(n_joints): + """Returns a tuple containing the model XML string and a dict of assets. + + Args: + n_joints: An integer specifying the number of joints in the swimmer. + + Returns: + A tuple `(model_xml_string, assets)`, where `assets` is a dict consisting of + `{filename: contents_string}` pairs. + """ + return _make_model(n_joints), common.ASSETS + + +@SUITE.add('benchmarking') +def swimmer6(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns a 6-link swimmer.""" + return _make_swimmer(6, time_limit, random=random, + environment_kwargs=environment_kwargs) + + +@SUITE.add('benchmarking') +def swimmer15(time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns a 15-link swimmer.""" + return _make_swimmer(15, time_limit, random=random, + environment_kwargs=environment_kwargs) + + +def swimmer(n_links=3, time_limit=_DEFAULT_TIME_LIMIT, + random=None, environment_kwargs=None): + """Returns a swimmer with n links.""" + return _make_swimmer(n_links, time_limit, random=random, + environment_kwargs=environment_kwargs) + + +def _make_swimmer(n_joints, time_limit=_DEFAULT_TIME_LIMIT, random=None, + environment_kwargs=None): + """Returns a swimmer control environment.""" + model_string, assets = get_model_and_assets(n_joints) + physics = Physics.from_xml_string(model_string, assets=assets) + task = Swimmer(random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +def _make_model(n_bodies): + """Generates an xml string defining a swimmer with `n_bodies` bodies.""" + if n_bodies < 3: + raise ValueError('At least 3 bodies required. 
Received {}'.format(n_bodies)) + mjcf = etree.fromstring(common.read_model('swimmer.xml')) + head_body = mjcf.find('./worldbody/body') + actuator = etree.SubElement(mjcf, 'actuator') + sensor = etree.SubElement(mjcf, 'sensor') + + parent = head_body + for body_index in range(n_bodies - 1): + site_name = 'site_{}'.format(body_index) + child = _make_body(body_index=body_index) + child.append(etree.Element('site', name=site_name)) + joint_name = 'joint_{}'.format(body_index) + joint_limit = 360.0/n_bodies + joint_range = '{} {}'.format(-joint_limit, joint_limit) + child.append(etree.Element('joint', {'name': joint_name, + 'range': joint_range})) + motor_name = 'motor_{}'.format(body_index) + actuator.append(etree.Element('motor', name=motor_name, joint=joint_name)) + velocimeter_name = 'velocimeter_{}'.format(body_index) + sensor.append(etree.Element('velocimeter', name=velocimeter_name, + site=site_name)) + gyro_name = 'gyro_{}'.format(body_index) + sensor.append(etree.Element('gyro', name=gyro_name, site=site_name)) + parent.append(child) + parent = child + + # Move tracking cameras further away from the swimmer according to its length. + cameras = mjcf.findall('./worldbody/body/camera') + scale = n_bodies / 6.0 + for cam in cameras: + if cam.get('mode') == 'trackcom': + old_pos = cam.get('pos').split(' ') + new_pos = ' '.join([str(float(dim) * scale) for dim in old_pos]) + cam.set('pos', new_pos) + + return etree.tostring(mjcf, pretty_print=True) + + +def _make_body(body_index): + """Generates an xml string defining a single physical body.""" + body_name = 'segment_{}'.format(body_index) + visual_name = 'visual_{}'.format(body_index) + inertial_name = 'inertial_{}'.format(body_index) + body = etree.Element('body', name=body_name) + body.set('pos', '0 .1 0') + etree.SubElement(body, 'geom', {'class': 'visual', 'name': visual_name}) + etree.SubElement(body, 'geom', {'class': 'inertial', 'name': inertial_name}) + return body + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the swimmer domain.""" + + def nose_to_target(self): + """Returns a vector from nose to target in local coordinate of the head.""" + nose_to_target = (self.named.data.geom_xpos['target'] - + self.named.data.geom_xpos['nose']) + head_orientation = self.named.data.xmat['head'].reshape(3, 3) + return nose_to_target.dot(head_orientation)[:2] + + def nose_to_target_dist(self): + """Returns the distance from the nose to the target.""" + return np.linalg.norm(self.nose_to_target()) + + def body_velocities(self): + """Returns local body velocities: x,y linear, z rotational.""" + xvel_local = self.data.sensordata[12:].reshape((-1, 6)) + vx_vy_wz = [0, 1, 5] # Indices for linear x,y vels and rotational z vel. + return xvel_local[:, vx_vy_wz].ravel() + + def joints(self): + """Returns all internal joint angles (excluding root joints).""" + return self.data.qpos[3:].copy() + + +class Swimmer(base.Task): + """A swimmer `Task` to reach the target or just swim.""" + + def __init__(self, random=None): + """Initializes an instance of `Swimmer`. + + Args: + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + super(Swimmer, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. 
+ + Initializes the swimmer orientation to [-pi, pi) and the relative joint + angle of each joint uniformly within its range. + + Args: + physics: An instance of `Physics`. + """ + # Random joint angles: + randomizers.randomize_limited_and_rotational_joints(physics, self.random) + # Random target position. + close_target = self.random.rand() < .2 # Probability of a close target. + target_box = .3 if close_target else 2 + xpos, ypos = self.random.uniform(-target_box, target_box, size=2) + physics.named.model.geom_pos['target', 'x'] = xpos + physics.named.model.geom_pos['target', 'y'] = ypos + physics.named.model.light_pos['target_light', 'x'] = xpos + physics.named.model.light_pos['target_light', 'y'] = ypos + + super(Swimmer, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of joint angles, body velocities and target.""" + obs = collections.OrderedDict() + obs['joints'] = physics.joints() + obs['to_target'] = physics.nose_to_target() + obs['body_velocities'] = physics.body_velocities() + return obs + + def get_reward(self, physics): + """Returns a smooth reward.""" + target_size = physics.named.model.geom_size['target', 0] + return rewards.tolerance(physics.nose_to_target_dist(), + bounds=(0, target_size), + margin=5*target_size, + sigmoid='long_tail') diff --git a/Dreamer/local_dm_control_suite/swimmer.xml b/Dreamer/local_dm_control_suite/swimmer.xml new file mode 100755 index 0000000..29c7bc8 --- /dev/null +++ b/Dreamer/local_dm_control_suite/swimmer.xml @@ -0,0 +1,57 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dreamer/local_dm_control_suite/tests/domains_test.py b/Dreamer/local_dm_control_suite/tests/domains_test.py new file mode 100755 index 0000000..4c148cf --- /dev/null +++ b/Dreamer/local_dm_control_suite/tests/domains_test.py @@ -0,0 +1,292 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tests for dm_control.suite domains.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Internal dependencies. +from absl.testing import absltest +from absl.testing import parameterized +from dm_control import suite +from dm_control.rl import control +import mock +import numpy as np +import six +from six.moves import range +from six.moves import zip + + +def uniform_random_policy(action_spec, random=None): + lower_bounds = action_spec.minimum + upper_bounds = action_spec.maximum + # Draw values between -1 and 1 for unbounded actions. + lower_bounds = np.where(np.isinf(lower_bounds), -1.0, lower_bounds) + upper_bounds = np.where(np.isinf(upper_bounds), 1.0, upper_bounds) + random_state = np.random.RandomState(random) + def policy(time_step): + del time_step # Unused. 
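+    # Each call draws an independent action, uniform over the (clamped)
+    # bounds; seeding `random_state` therefore makes the whole sampled
+    # action sequence reproducible.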
+ return random_state.uniform(lower_bounds, upper_bounds) + return policy + + +def step_environment(env, policy, num_episodes=5, max_steps_per_episode=10): + for _ in range(num_episodes): + step_count = 0 + time_step = env.reset() + yield time_step + while not time_step.last(): + action = policy(time_step) + time_step = env.step(action) + step_count += 1 + yield time_step + if step_count >= max_steps_per_episode: + break + + +def make_trajectory(domain, task, seed, **trajectory_kwargs): + env = suite.load(domain, task, task_kwargs={'random': seed}) + policy = uniform_random_policy(env.action_spec(), random=seed) + return step_environment(env, policy, **trajectory_kwargs) + + +class DomainTest(parameterized.TestCase): + """Tests run on all the tasks registered.""" + + def test_constants(self): + num_tasks = sum(len(tasks) for tasks in + six.itervalues(suite.TASKS_BY_DOMAIN)) + + self.assertLen(suite.ALL_TASKS, num_tasks) + + def _validate_observation(self, observation_dict, observation_spec): + obs = observation_dict.copy() + for name, spec in six.iteritems(observation_spec): + arr = obs.pop(name) + self.assertEqual(arr.shape, spec.shape) + self.assertEqual(arr.dtype, spec.dtype) + self.assertTrue( + np.all(np.isfinite(arr)), + msg='{!r} has non-finite value(s): {!r}'.format(name, arr)) + self.assertEmpty( + obs, + msg='Observation contains arrays(s) that are not in the spec: {!r}' + .format(obs)) + + def _validate_reward_range(self, time_step): + if time_step.first(): + self.assertIsNone(time_step.reward) + else: + self.assertIsInstance(time_step.reward, float) + self.assertBetween(time_step.reward, 0, 1) + + def _validate_discount(self, time_step): + if time_step.first(): + self.assertIsNone(time_step.discount) + else: + self.assertIsInstance(time_step.discount, float) + self.assertBetween(time_step.discount, 0, 1) + + def _validate_control_range(self, lower_bounds, upper_bounds): + for b in lower_bounds: + self.assertEqual(b, -1.0) + for b in upper_bounds: + self.assertEqual(b, 1.0) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_components_have_names(self, domain, task): + env = suite.load(domain, task) + model = env.physics.model + + object_types_and_size_fields = [ + ('body', 'nbody'), + ('joint', 'njnt'), + ('geom', 'ngeom'), + ('site', 'nsite'), + ('camera', 'ncam'), + ('light', 'nlight'), + ('mesh', 'nmesh'), + ('hfield', 'nhfield'), + ('texture', 'ntex'), + ('material', 'nmat'), + ('equality', 'neq'), + ('tendon', 'ntendon'), + ('actuator', 'nu'), + ('sensor', 'nsensor'), + ('numeric', 'nnumeric'), + ('text', 'ntext'), + ('tuple', 'ntuple'), + ] + for object_type, size_field in object_types_and_size_fields: + for idx in range(getattr(model, size_field)): + object_name = model.id2name(idx, object_type) + self.assertNotEqual(object_name, '', + msg='Model {!r} contains unnamed {!r} with ID {}.' + .format(model.name, object_type, idx)) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_model_has_at_least_2_cameras(self, domain, task): + env = suite.load(domain, task) + model = env.physics.model + self.assertGreaterEqual(model.ncam, 2, + 'Model {!r} should have at least 2 cameras, has {}.' 
+ .format(model.name, model.ncam)) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_task_conforms_to_spec(self, domain, task): + """Tests that the environment timesteps conform to specifications.""" + is_benchmark = (domain, task) in suite.BENCHMARKING + env = suite.load(domain, task) + observation_spec = env.observation_spec() + action_spec = env.action_spec() + + # Check action bounds. + if is_benchmark: + self._validate_control_range(action_spec.minimum, action_spec.maximum) + + # Step through the environment, applying random actions sampled within the + # valid range and check the observations, rewards, and discounts. + policy = uniform_random_policy(action_spec) + for time_step in step_environment(env, policy): + self._validate_observation(time_step.observation, observation_spec) + self._validate_discount(time_step) + if is_benchmark: + self._validate_reward_range(time_step) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_environment_is_deterministic(self, domain, task): + """Tests that identical seeds and actions produce identical trajectories.""" + seed = 0 + # Iterate over two trajectories generated using identical sequences of + # random actions, and with identical task random states. Check that the + # observations, rewards, discounts and step types are identical. + trajectory1 = make_trajectory(domain=domain, task=task, seed=seed) + trajectory2 = make_trajectory(domain=domain, task=task, seed=seed) + for time_step1, time_step2 in zip(trajectory1, trajectory2): + self.assertEqual(time_step1.step_type, time_step2.step_type) + self.assertEqual(time_step1.reward, time_step2.reward) + self.assertEqual(time_step1.discount, time_step2.discount) + for key in six.iterkeys(time_step1.observation): + np.testing.assert_array_equal( + time_step1.observation[key], time_step2.observation[key], + err_msg='Observation {!r} is not equal.'.format(key)) + + def assertCorrectColors(self, physics, reward): + colors = physics.named.model.mat_rgba + for material_name in ('self', 'effector', 'target'): + highlight = colors[material_name + '_highlight'] + default = colors[material_name + '_default'] + blend_coef = reward ** 4 + expected = blend_coef * highlight + (1.0 - blend_coef) * default + actual = colors[material_name] + err_msg = ('Material {!r} has unexpected color.\nExpected: {!r}\n' + 'Actual: {!r}'.format(material_name, expected, actual)) + np.testing.assert_array_almost_equal(expected, actual, err_msg=err_msg) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_visualize_reward(self, domain, task): + env = suite.load(domain, task) + env.task.visualize_reward = True + action = np.zeros(env.action_spec().shape) + + with mock.patch.object(env.task, 'get_reward') as mock_get_reward: + mock_get_reward.return_value = -3.0 # Rewards < 0 should be clipped. + env.reset() + mock_get_reward.assert_called_with(env.physics) + self.assertCorrectColors(env.physics, reward=0.0) + + mock_get_reward.reset_mock() + mock_get_reward.return_value = 0.5 + env.step(action) + mock_get_reward.assert_called_with(env.physics) + self.assertCorrectColors(env.physics, reward=mock_get_reward.return_value) + + mock_get_reward.reset_mock() + mock_get_reward.return_value = 2.0 # Rewards > 1 should be clipped. 
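+      # After this step the colors should blend as if the reward were exactly
+      # 1.0, confirming that rewards are clipped to [0, 1] before blending.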
+ env.step(action) + mock_get_reward.assert_called_with(env.physics) + self.assertCorrectColors(env.physics, reward=1.0) + + mock_get_reward.reset_mock() + mock_get_reward.return_value = 0.25 + env.reset() + mock_get_reward.assert_called_with(env.physics) + self.assertCorrectColors(env.physics, reward=mock_get_reward.return_value) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_task_supports_environment_kwargs(self, domain, task): + env = suite.load(domain, task, + environment_kwargs=dict(flat_observation=True)) + # Check that the kwargs are actually passed through to the environment. + self.assertSetEqual(set(env.observation_spec()), + {control.FLAT_OBSERVATION_KEY}) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_observation_arrays_dont_share_memory(self, domain, task): + env = suite.load(domain, task) + first_timestep = env.reset() + action = np.zeros(env.action_spec().shape) + second_timestep = env.step(action) + for name, first_array in six.iteritems(first_timestep.observation): + second_array = second_timestep.observation[name] + self.assertFalse( + np.may_share_memory(first_array, second_array), + msg='Consecutive observations of {!r} may share memory.'.format(name)) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_observations_dont_contain_constant_elements(self, domain, task): + env = suite.load(domain, task) + trajectory = make_trajectory(domain=domain, task=task, seed=0, + num_episodes=2, max_steps_per_episode=1000) + observations = {name: [] for name in env.observation_spec()} + for time_step in trajectory: + for name, array in six.iteritems(time_step.observation): + observations[name].append(array) + + failures = [] + + for name, array_list in six.iteritems(observations): + # Sampling random uniform actions generally isn't sufficient to trigger + # these touch sensors. + if (domain in ('manipulator', 'stacker') and name == 'touch' or + domain == 'quadruped' and name == 'force_torque'): + continue + stacked_arrays = np.array(array_list) + is_constant = np.all(stacked_arrays == stacked_arrays[0], axis=0) + has_constant_elements = ( + is_constant if np.isscalar(is_constant) else np.any(is_constant)) + if has_constant_elements: + failures.append((name, is_constant)) + + self.assertEmpty( + failures, + msg='The following observation(s) contain constant elements:\n{}' + .format('\n'.join(':\t'.join([name, str(is_constant)]) + for (name, is_constant) in failures))) + + @parameterized.parameters(*suite.ALL_TASKS) + def test_initial_state_is_randomized(self, domain, task): + env = suite.load(domain, task, task_kwargs={'random': 42}) + obs1 = env.reset().observation + obs2 = env.reset().observation + self.assertFalse( + all(np.all(obs1[k] == obs2[k]) for k in obs1), + 'Two consecutive initial states have identical observations.\n' + 'First: {}\nSecond: {}'.format(obs1, obs2)) + +if __name__ == '__main__': + absltest.main() diff --git a/Dreamer/local_dm_control_suite/tests/loader_test.py b/Dreamer/local_dm_control_suite/tests/loader_test.py new file mode 100755 index 0000000..cbce4f5 --- /dev/null +++ b/Dreamer/local_dm_control_suite/tests/loader_test.py @@ -0,0 +1,52 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tests for the dm_control.suite loader.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Internal dependencies. + +from absl.testing import absltest + +from dm_control import suite +from dm_control.rl import control + + +class LoaderTest(absltest.TestCase): + + def test_load_without_kwargs(self): + env = suite.load('cartpole', 'swingup') + self.assertIsInstance(env, control.Environment) + + def test_load_with_kwargs(self): + env = suite.load('cartpole', 'swingup', + task_kwargs={'time_limit': 40, 'random': 99}) + self.assertIsInstance(env, control.Environment) + + +class LoaderConstantsTest(absltest.TestCase): + + def testSuiteConstants(self): + self.assertNotEmpty(suite.BENCHMARKING) + self.assertNotEmpty(suite.EASY) + self.assertNotEmpty(suite.HARD) + self.assertNotEmpty(suite.EXTRA) + + +if __name__ == '__main__': + absltest.main() diff --git a/Dreamer/local_dm_control_suite/tests/lqr_test.py b/Dreamer/local_dm_control_suite/tests/lqr_test.py new file mode 100755 index 0000000..d6edcf0 --- /dev/null +++ b/Dreamer/local_dm_control_suite/tests/lqr_test.py @@ -0,0 +1,88 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tests specific to the LQR domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import unittest + +# Internal dependencies. +from absl import logging + +from absl.testing import absltest +from absl.testing import parameterized + +from local_dm_control_suite import lqr +from local_dm_control_suite import lqr_solver + +import numpy as np +from six.moves import range + + +class LqrTest(parameterized.TestCase): + + @parameterized.named_parameters( + ('lqr_2_1', lqr.lqr_2_1), + ('lqr_6_2', lqr.lqr_6_2)) + def test_lqr_optimal_policy(self, make_env): + env = make_env() + p, k, beta = lqr_solver.solve(env) + self.assertPolicyisOptimal(env, p, k, beta) + + @parameterized.named_parameters( + ('lqr_2_1', lqr.lqr_2_1), + ('lqr_6_2', lqr.lqr_6_2)) + @unittest.skipUnless( + condition=lqr_solver.sp, + reason='scipy is not available, so non-scipy DARE solver is the default.') + def test_lqr_optimal_policy_no_scipy(self, make_env): + env = make_env() + old_sp = lqr_solver.sp + try: + lqr_solver.sp = None # Force the solver to use the non-scipy code path. 
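+      # `solve` returns the quadratic value matrix `p`, the policy gain `k`
+      # (the optimal action is u = k.dot(x)) and the convergence rate `beta`
+      # used below to choose the evaluation horizon.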
+ p, k, beta = lqr_solver.solve(env) + finally: + lqr_solver.sp = old_sp + self.assertPolicyisOptimal(env, p, k, beta) + + def assertPolicyisOptimal(self, env, p, k, beta): + tolerance = 1e-3 + n_steps = int(math.ceil(math.log10(tolerance) / math.log10(beta))) + logging.info('%d timesteps for %g convergence.', n_steps, tolerance) + total_loss = 0.0 + + timestep = env.reset() + initial_state = np.hstack((timestep.observation['position'], + timestep.observation['velocity'])) + logging.info('Measuring total cost over %d steps.', n_steps) + for _ in range(n_steps): + x = np.hstack((timestep.observation['position'], + timestep.observation['velocity'])) + # u = k*x is the optimal policy + u = k.dot(x) + total_loss += 1 - (timestep.reward or 0.0) + timestep = env.step(u) + + logging.info('Analytical expected total cost is .5*x^T*p*x.') + expected_loss = .5 * initial_state.T.dot(p).dot(initial_state) + logging.info('Comparing measured and predicted costs.') + np.testing.assert_allclose(expected_loss, total_loss, rtol=tolerance) + +if __name__ == '__main__': + absltest.main() diff --git a/Dreamer/local_dm_control_suite/utils/__init__.py b/Dreamer/local_dm_control_suite/utils/__init__.py new file mode 100755 index 0000000..2ea19cf --- /dev/null +++ b/Dreamer/local_dm_control_suite/utils/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Utility functions used in the control suite.""" diff --git a/Dreamer/local_dm_control_suite/utils/parse_amc.py b/Dreamer/local_dm_control_suite/utils/parse_amc.py new file mode 100755 index 0000000..3cea2ab --- /dev/null +++ b/Dreamer/local_dm_control_suite/utils/parse_amc.py @@ -0,0 +1,251 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Parse and convert amc motion capture data.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control.mujoco.wrapper import mjbindings +import numpy as np +from scipy import interpolate +from six.moves import range + +mjlib = mjbindings.mjlib + +MOCAP_DT = 1.0/120.0 +CONVERSION_LENGTH = 0.056444 + +_CMU_MOCAP_JOINT_ORDER = ( + 'root0', 'root1', 'root2', 'root3', 'root4', 'root5', 'lowerbackrx', + 'lowerbackry', 'lowerbackrz', 'upperbackrx', 'upperbackry', 'upperbackrz', + 'thoraxrx', 'thoraxry', 'thoraxrz', 'lowerneckrx', 'lowerneckry', + 'lowerneckrz', 'upperneckrx', 'upperneckry', 'upperneckrz', 'headrx', + 'headry', 'headrz', 'rclaviclery', 'rclaviclerz', 'rhumerusrx', + 'rhumerusry', 'rhumerusrz', 'rradiusrx', 'rwristry', 'rhandrx', 'rhandrz', + 'rfingersrx', 'rthumbrx', 'rthumbrz', 'lclaviclery', 'lclaviclerz', + 'lhumerusrx', 'lhumerusry', 'lhumerusrz', 'lradiusrx', 'lwristry', + 'lhandrx', 'lhandrz', 'lfingersrx', 'lthumbrx', 'lthumbrz', 'rfemurrx', + 'rfemurry', 'rfemurrz', 'rtibiarx', 'rfootrx', 'rfootrz', 'rtoesrx', + 'lfemurrx', 'lfemurry', 'lfemurrz', 'ltibiarx', 'lfootrx', 'lfootrz', + 'ltoesrx' +) + +Converted = collections.namedtuple('Converted', + ['qpos', 'qvel', 'time']) + + +def convert(file_name, physics, timestep): + """Converts the parsed .amc values into qpos and qvel values and resamples. + + Args: + file_name: The .amc file to be parsed and converted. + physics: The corresponding physics instance. + timestep: Desired output interval between resampled frames. + + Returns: + A namedtuple with fields: + `qpos`, a numpy array containing converted positional variables. + `qvel`, a numpy array containing converted velocity variables. + `time`, a numpy array containing the corresponding times. + """ + frame_values = parse(file_name) + joint2index = {} + for name in physics.named.data.qpos.axes.row.names: + joint2index[name] = physics.named.data.qpos.axes.row.convert_key_item(name) + index2joint = {} + for joint, index in joint2index.items(): + if isinstance(index, slice): + indices = range(index.start, index.stop) + else: + indices = [index] + for ii in indices: + index2joint[ii] = joint + + # Convert frame_values to qpos + amcvals2qpos_transformer = Amcvals2qpos(index2joint, _CMU_MOCAP_JOINT_ORDER) + qpos_values = [] + for frame_value in frame_values: + qpos_values.append(amcvals2qpos_transformer(frame_value)) + qpos_values = np.stack(qpos_values) # Time by nq + + # Interpolate/resample. + # Note: interpolate quaternions rather than euler angles (slerp). 
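+  # Component-wise spline interpolation of quaternions is only a reasonable
+  # approximation when consecutive mocap frames are close together;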
+  # see https://en.wikipedia.org/wiki/Slerp
+  qpos_values_resampled = []
+  time_vals = np.arange(0, len(frame_values)*MOCAP_DT - 1e-8, MOCAP_DT)
+  time_vals_new = np.arange(0, len(frame_values)*MOCAP_DT, timestep)
+  while time_vals_new[-1] > time_vals[-1]:
+    time_vals_new = time_vals_new[:-1]
+
+  for i in range(qpos_values.shape[1]):
+    f = interpolate.splrep(time_vals, qpos_values[:, i])
+    qpos_values_resampled.append(interpolate.splev(time_vals_new, f))
+
+  qpos_values_resampled = np.stack(qpos_values_resampled)  # nq by ntime
+
+  qvel_list = []
+  for t in range(qpos_values_resampled.shape[1]-1):
+    p_tp1 = qpos_values_resampled[:, t + 1]
+    p_t = qpos_values_resampled[:, t]
+    qvel = [(p_tp1[:3] - p_t[:3]) / timestep,
+            mj_quat2vel(mj_quatdiff(p_t[3:7], p_tp1[3:7]), timestep),
+            (p_tp1[7:] - p_t[7:]) / timestep]
+    qvel_list.append(np.concatenate(qvel))
+
+  qvel_values_resampled = np.vstack(qvel_list).T
+
+  return Converted(qpos_values_resampled, qvel_values_resampled, time_vals_new)
+
+
+def parse(file_name):
+  """Parses the amc file format."""
+  values = []
+  fid = open(file_name, 'r')
+  line = fid.readline().strip()
+  frame_ind = 1
+  first_frame = True
+  while True:
+    # Parse first frame.
+    if first_frame and line[0] == str(frame_ind):
+      first_frame = False
+      frame_ind += 1
+      frame_vals = []
+      while True:
+        line = fid.readline().strip()
+        if not line or line == str(frame_ind):
+          values.append(np.array(frame_vals, dtype=float))
+          break
+        tokens = line.split()
+        frame_vals.extend(tokens[1:])
+    # Parse other frames.
+    elif line == str(frame_ind):
+      frame_ind += 1
+      frame_vals = []
+      while True:
+        line = fid.readline().strip()
+        if not line or line == str(frame_ind):
+          values.append(np.array(frame_vals, dtype=float))
+          break
+        tokens = line.split()
+        frame_vals.extend(tokens[1:])
+    else:
+      line = fid.readline().strip()
+      if not line:
+        break
+  return values
+
+
+class Amcvals2qpos(object):
+  """Callable that converts .amc values for a frame to MuJoCo qpos format."""
+
+  def __init__(self, index2joint, joint_order):
+    """Initializes a new Amcvals2qpos instance.
+
+    Args:
+      index2joint: Mapping from qpos indices to joint names.
+      joint_order: List of joint names in the MuJoCo MJCF model.
+    """
+    # Root is x,y,z, then quat.
+    # Need to map the default .amc joint ordering onto the corresponding qpos
+    # indices.
+    self.qpos_root_xyz_ind = [0, 1, 2]
+    self.root_xyz_transform = np.array(
+        [[1, 0, 0], [0, 0, -1], [0, 1, 0]]) * CONVERSION_LENGTH
+    self.qpos_root_quat_ind = [3, 4, 5, 6]
+    amc2qpos_transform = np.zeros((len(index2joint), len(joint_order)))
+    for i in range(len(index2joint)):
+      for j in range(len(joint_order)):
+        if index2joint[i] == joint_order[j]:
+          if 'rx' in index2joint[i]:
+            amc2qpos_transform[i][j] = 1
+          elif 'ry' in index2joint[i]:
+            amc2qpos_transform[i][j] = 1
+          elif 'rz' in index2joint[i]:
+            amc2qpos_transform[i][j] = 1
+    self.amc2qpos_transform = amc2qpos_transform
+
+  def __call__(self, amc_val):
+    """Converts a `.amc` frame to MuJoCo qpos format."""
+    amc_val_rad = np.deg2rad(amc_val)
+    qpos = np.dot(self.amc2qpos_transform, amc_val_rad)
+
+    # Root.
+    qpos[:3] = np.dot(self.root_xyz_transform, amc_val[:3])
+    qpos_quat = euler2quat(amc_val[3], amc_val[4], amc_val[5])
+    qpos_quat = mj_quatprod(euler2quat(90, 0, 0), qpos_quat)
+
+    for i, ind in enumerate(self.qpos_root_quat_ind):
+      qpos[ind] = qpos_quat[i]
+
+    return qpos
+
+
+def euler2quat(ax, ay, az):
+  """Converts euler angles to a quaternion.
+
+  Note: rotation order is zyx.
+
+  Args:
+    ax: Roll angle (deg).
+    ay: Pitch angle (deg).
+    az: Yaw angle (deg).
+ + Returns: + A numpy array representing the rotation as a quaternion. + """ + r1 = az + r2 = ay + r3 = ax + + c1 = np.cos(np.deg2rad(r1 / 2)) + s1 = np.sin(np.deg2rad(r1 / 2)) + c2 = np.cos(np.deg2rad(r2 / 2)) + s2 = np.sin(np.deg2rad(r2 / 2)) + c3 = np.cos(np.deg2rad(r3 / 2)) + s3 = np.sin(np.deg2rad(r3 / 2)) + + q0 = c1 * c2 * c3 + s1 * s2 * s3 + q1 = c1 * c2 * s3 - s1 * s2 * c3 + q2 = c1 * s2 * c3 + s1 * c2 * s3 + q3 = s1 * c2 * c3 - c1 * s2 * s3 + + return np.array([q0, q1, q2, q3]) + + +def mj_quatprod(q, r): + quaternion = np.zeros(4) + mjlib.mju_mulQuat(quaternion, np.ascontiguousarray(q), + np.ascontiguousarray(r)) + return quaternion + + +def mj_quat2vel(q, dt): + vel = np.zeros(3) + mjlib.mju_quat2Vel(vel, np.ascontiguousarray(q), dt) + return vel + + +def mj_quatneg(q): + quaternion = np.zeros(4) + mjlib.mju_negQuat(quaternion, np.ascontiguousarray(q)) + return quaternion + + +def mj_quatdiff(source, target): + return mj_quatprod(mj_quatneg(source), np.ascontiguousarray(target)) diff --git a/Dreamer/local_dm_control_suite/utils/parse_amc_test.py b/Dreamer/local_dm_control_suite/utils/parse_amc_test.py new file mode 100755 index 0000000..c8a9052 --- /dev/null +++ b/Dreamer/local_dm_control_suite/utils/parse_amc_test.py @@ -0,0 +1,68 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tests for parse_amc utility.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +# Internal dependencies. + +from absl.testing import absltest +from local_dm_control_suite import humanoid_CMU +from dm_control.suite.utils import parse_amc + +from dm_control.utils import io as resources + +_TEST_AMC_PATH = resources.GetResourceFilename( + os.path.join(os.path.dirname(__file__), '../demos/zeros.amc')) + + +class ParseAMCTest(absltest.TestCase): + + def test_sizes_of_parsed_data(self): + + # Instantiate the humanoid environment. + env = humanoid_CMU.stand() + + # Parse and convert specified clip. 
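+    # `convert` returns a `Converted` namedtuple whose `qpos` and `qvel`
+    # arrays are laid out as (dof, num_frames); there is one fewer velocity
+    # frame than position frames because velocities are finite differences.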
+    converted = parse_amc.convert(
+        _TEST_AMC_PATH, env.physics, env.control_timestep())
+
+    self.assertEqual(converted.qpos.shape[0], 63)
+    self.assertEqual(converted.qvel.shape[0], 62)
+    self.assertEqual(converted.time.shape[0], converted.qpos.shape[1])
+    self.assertEqual(converted.qpos.shape[1],
+                     converted.qvel.shape[1] + 1)
+
+    # Parse and convert specified clip -- WITH SMALLER TIMESTEP
+    converted2 = parse_amc.convert(
+        _TEST_AMC_PATH, env.physics, 0.5 * env.control_timestep())
+
+    self.assertEqual(converted2.qpos.shape[0], 63)
+    self.assertEqual(converted2.qvel.shape[0], 62)
+    self.assertEqual(converted2.time.shape[0], converted2.qpos.shape[1])
+    self.assertEqual(converted2.qpos.shape[1],
+                     converted2.qvel.shape[1] + 1)
+
+    # Compare sizes of parsed objects for different timesteps
+    self.assertEqual(converted.qpos.shape[1] * 2, converted2.qpos.shape[1])
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/Dreamer/local_dm_control_suite/utils/randomizers.py b/Dreamer/local_dm_control_suite/utils/randomizers.py
new file mode 100755
index 0000000..30ec182
--- /dev/null
+++ b/Dreamer/local_dm_control_suite/utils/randomizers.py
@@ -0,0 +1,91 @@
+# Copyright 2017 The dm_control Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Randomization functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from dm_control.mujoco.wrapper import mjbindings
+import numpy as np
+from six.moves import range
+
+
+def random_limited_quaternion(random, limit):
+  """Generates a random quaternion limited to the specified rotations."""
+  axis = random.randn(3)
+  axis /= np.linalg.norm(axis)
+  angle = random.rand() * limit
+
+  quaternion = np.zeros(4)
+  mjbindings.mjlib.mju_axisAngle2Quat(quaternion, axis, angle)
+
+  return quaternion
+
+
+def randomize_limited_and_rotational_joints(physics, random=None):
+  """Randomizes the positions of joints defined in the physics body.
+
+  The following randomization rules apply:
+    - Bounded joints (hinges or sliders) are sampled uniformly in the bounds.
+    - Unbounded hinges are sampled uniformly in [-pi, pi].
+    - Quaternions for unlimited ball joints are sampled uniformly on the unit
+      3-sphere; quaternions for free joints are drawn by normalizing a vector
+      of uniform variates (note that this is not exactly uniform over
+      orientations).
+    - Quaternions for limited ball joints are sampled uniformly on a sector
+      of the unit 3-sphere.
+    - The linear degrees of freedom of free joints are not randomized.
+
+  Args:
+    physics: Instance of 'Physics' class that holds a loaded model.
+    random: Optional instance of 'np.random.RandomState'. Defaults to the global
+      NumPy random state.
+ """ + random = random or np.random + + hinge = mjbindings.enums.mjtJoint.mjJNT_HINGE + slide = mjbindings.enums.mjtJoint.mjJNT_SLIDE + ball = mjbindings.enums.mjtJoint.mjJNT_BALL + free = mjbindings.enums.mjtJoint.mjJNT_FREE + + qpos = physics.named.data.qpos + + for joint_id in range(physics.model.njnt): + joint_name = physics.model.id2name(joint_id, 'joint') + joint_type = physics.model.jnt_type[joint_id] + is_limited = physics.model.jnt_limited[joint_id] + range_min, range_max = physics.model.jnt_range[joint_id] + + if is_limited: + if joint_type == hinge or joint_type == slide: + qpos[joint_name] = random.uniform(range_min, range_max) + + elif joint_type == ball: + qpos[joint_name] = random_limited_quaternion(random, range_max) + + else: + if joint_type == hinge: + qpos[joint_name] = random.uniform(-np.pi, np.pi) + + elif joint_type == ball: + quat = random.randn(4) + quat /= np.linalg.norm(quat) + qpos[joint_name] = quat + + elif joint_type == free: + quat = random.rand(4) + quat /= np.linalg.norm(quat) + qpos[joint_name][3:] = quat + diff --git a/Dreamer/local_dm_control_suite/utils/randomizers_test.py b/Dreamer/local_dm_control_suite/utils/randomizers_test.py new file mode 100755 index 0000000..8b9b72d --- /dev/null +++ b/Dreamer/local_dm_control_suite/utils/randomizers_test.py @@ -0,0 +1,164 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tests for randomizers.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Internal dependencies. +from absl.testing import absltest +from absl.testing import parameterized +from dm_control import mujoco +from dm_control.mujoco.wrapper import mjbindings +from dm_control.suite.utils import randomizers +import numpy as np +from six.moves import range + +mjlib = mjbindings.mjlib + + +class RandomizeUnlimitedJointsTest(parameterized.TestCase): + + def setUp(self): + self.rand = np.random.RandomState(100) + + def test_single_joint_of_each_type(self): + physics = mujoco.Physics.from_xml_string(""" + + + + + + + + + + + + + + + + + + + + + + + + + """) + + randomizers.randomize_limited_and_rotational_joints(physics, self.rand) + self.assertNotEqual(0., physics.named.data.qpos['hinge']) + self.assertNotEqual(0., physics.named.data.qpos['limited_hinge']) + self.assertNotEqual(0., physics.named.data.qpos['limited_slide']) + + self.assertNotEqual(0., np.sum(physics.named.data.qpos['ball'])) + self.assertNotEqual(0., np.sum(physics.named.data.qpos['limited_ball'])) + + self.assertNotEqual(0., np.sum(physics.named.data.qpos['free'][3:])) + + # Unlimited slide and the positional part of the free joint remains + # uninitialized. 
+ self.assertEqual(0., physics.named.data.qpos['slide']) + self.assertEqual(0., np.sum(physics.named.data.qpos['free'][:3])) + + def test_multiple_joints_of_same_type(self): + physics = mujoco.Physics.from_xml_string(""" + + + + + + + + + """) + + randomizers.randomize_limited_and_rotational_joints(physics, self.rand) + self.assertNotEqual(0., physics.named.data.qpos['hinge_1']) + self.assertNotEqual(0., physics.named.data.qpos['hinge_2']) + self.assertNotEqual(0., physics.named.data.qpos['hinge_3']) + + self.assertNotEqual(physics.named.data.qpos['hinge_1'], + physics.named.data.qpos['hinge_2']) + + self.assertNotEqual(physics.named.data.qpos['hinge_2'], + physics.named.data.qpos['hinge_3']) + + self.assertNotEqual(physics.named.data.qpos['hinge_1'], + physics.named.data.qpos['hinge_3']) + + def test_unlimited_hinge_randomization_range(self): + physics = mujoco.Physics.from_xml_string(""" + + + + + + + """) + + for _ in range(10): + randomizers.randomize_limited_and_rotational_joints(physics, self.rand) + self.assertBetween(physics.named.data.qpos['hinge'], -np.pi, np.pi) + + def test_limited_1d_joint_limits_are_respected(self): + physics = mujoco.Physics.from_xml_string(""" + + + + + + + + + + + """) + + for _ in range(10): + randomizers.randomize_limited_and_rotational_joints(physics, self.rand) + self.assertBetween(physics.named.data.qpos['hinge'], + np.deg2rad(0), np.deg2rad(10)) + self.assertBetween(physics.named.data.qpos['slide'], 30, 50) + + def test_limited_ball_joint_are_respected(self): + physics = mujoco.Physics.from_xml_string(""" + + + + + + + """) + + body_axis = np.array([1., 0., 0.]) + joint_axis = np.zeros(3) + for _ in range(10): + randomizers.randomize_limited_and_rotational_joints(physics, self.rand) + + quat = physics.named.data.qpos['ball'] + mjlib.mju_rotVecQuat(joint_axis, body_axis, quat) + angle_cos = np.dot(body_axis, joint_axis) + self.assertGreater(angle_cos, 0.5) # cos(60) = 0.5 + + +if __name__ == '__main__': + absltest.main() diff --git a/Dreamer/local_dm_control_suite/walker.py b/Dreamer/local_dm_control_suite/walker.py new file mode 100755 index 0000000..b7bfd58 --- /dev/null +++ b/Dreamer/local_dm_control_suite/walker.py @@ -0,0 +1,158 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Planar Walker Domain.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dm_control import mujoco +from dm_control.rl import control +from local_dm_control_suite import base +from local_dm_control_suite import common +from dm_control.suite.utils import randomizers +from dm_control.utils import containers +from dm_control.utils import rewards + + +_DEFAULT_TIME_LIMIT = 25 +_CONTROL_TIMESTEP = .025 + +# Minimal height of torso over foot above which stand reward is 1. +_STAND_HEIGHT = 1.2 + +# Horizontal speeds (meters/second) above which move reward is 1. 
+_WALK_SPEED = 1 +_RUN_SPEED = 8 + + +SUITE = containers.TaggedTasks() + + +def get_model_and_assets(): + """Returns a tuple containing the model XML string and a dict of assets.""" + return common.read_model('walker.xml'), common.ASSETS + + +@SUITE.add('benchmarking') +def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Stand task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = PlanarWalker(move_speed=0, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add('benchmarking') +def walk(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Walk task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = PlanarWalker(move_speed=_WALK_SPEED, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +@SUITE.add('benchmarking') +def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None): + """Returns the Run task.""" + physics = Physics.from_xml_string(*get_model_and_assets()) + task = PlanarWalker(move_speed=_RUN_SPEED, random=random) + environment_kwargs = environment_kwargs or {} + return control.Environment( + physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP, + **environment_kwargs) + + +class Physics(mujoco.Physics): + """Physics simulation with additional features for the Walker domain.""" + + def torso_upright(self): + """Returns projection from z-axes of torso to the z-axes of world.""" + return self.named.data.xmat['torso', 'zz'] + + def torso_height(self): + """Returns the height of the torso.""" + return self.named.data.xpos['torso', 'z'] + + def horizontal_velocity(self): + """Returns the horizontal velocity of the center-of-mass.""" + return self.named.data.sensordata['torso_subtreelinvel'][0] + + def orientations(self): + """Returns planar orientations of all bodies.""" + return self.named.data.xmat[1:, ['xx', 'xz']].ravel() + + +class PlanarWalker(base.Task): + """A planar walker task.""" + + def __init__(self, move_speed, random=None): + """Initializes an instance of `PlanarWalker`. + + Args: + move_speed: A float. If this value is zero, reward is given simply for + standing up. Otherwise this specifies a target horizontal velocity for + the walking task. + random: Optional, either a `numpy.random.RandomState` instance, an + integer seed for creating a new `RandomState`, or None to select a seed + automatically (default). + """ + self._move_speed = move_speed + super(PlanarWalker, self).__init__(random=random) + + def initialize_episode(self, physics): + """Sets the state of the environment at the start of each episode. + + In 'standing' mode, use initial orientation and small velocities. + In 'random' mode, randomize joint angles and let fall to the floor. + + Args: + physics: An instance of `Physics`. 
+ + """ + randomizers.randomize_limited_and_rotational_joints(physics, self.random) + super(PlanarWalker, self).initialize_episode(physics) + + def get_observation(self, physics): + """Returns an observation of body orientations, height and velocites.""" + obs = collections.OrderedDict() + obs['orientations'] = physics.orientations() + obs['height'] = physics.torso_height() + obs['velocity'] = physics.velocity() + return obs + + def get_reward(self, physics): + """Returns a reward to the agent.""" + standing = rewards.tolerance(physics.torso_height(), + bounds=(_STAND_HEIGHT, float('inf')), + margin=_STAND_HEIGHT/2) + upright = (1 + physics.torso_upright()) / 2 + stand_reward = (3*standing + upright) / 4 + if self._move_speed == 0: + return stand_reward + else: + move_reward = rewards.tolerance(physics.horizontal_velocity(), + bounds=(self._move_speed, float('inf')), + margin=self._move_speed/2, + value_at_margin=0.5, + sigmoid='linear') + return stand_reward * (5*move_reward + 1) / 6 diff --git a/Dreamer/local_dm_control_suite/walker.xml b/Dreamer/local_dm_control_suite/walker.xml new file mode 100755 index 0000000..9509893 --- /dev/null +++ b/Dreamer/local_dm_control_suite/walker.xml @@ -0,0 +1,70 @@ + + + + + + diff --git a/Dreamer/local_dm_control_suite/wrappers/__init__.py b/Dreamer/local_dm_control_suite/wrappers/__init__.py new file mode 100755 index 0000000..f7e4a68 --- /dev/null +++ b/Dreamer/local_dm_control_suite/wrappers/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2018 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Environment wrappers used to extend or modify environment behaviour.""" diff --git a/Dreamer/local_dm_control_suite/wrappers/action_noise.py b/Dreamer/local_dm_control_suite/wrappers/action_noise.py new file mode 100755 index 0000000..dab9970 --- /dev/null +++ b/Dreamer/local_dm_control_suite/wrappers/action_noise.py @@ -0,0 +1,74 @@ +# Copyright 2018 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Wrapper control suite environments that adds Gaussian noise to actions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import dm_env +import numpy as np + + +_BOUNDS_MUST_BE_FINITE = ( + 'All bounds in `env.action_spec()` must be finite, got: {action_spec}') + + +class Wrapper(dm_env.Environment): + """Wraps a control environment and adds Gaussian noise to actions.""" + + def __init__(self, env, scale=0.01): + """Initializes a new action noise Wrapper. + + Args: + env: The control suite environment to wrap. + scale: The standard deviation of the noise, expressed as a fraction + of the max-min range for each action dimension. + + Raises: + ValueError: If any of the action dimensions of the wrapped environment are + unbounded. + """ + action_spec = env.action_spec() + if not (np.all(np.isfinite(action_spec.minimum)) and + np.all(np.isfinite(action_spec.maximum))): + raise ValueError(_BOUNDS_MUST_BE_FINITE.format(action_spec=action_spec)) + self._minimum = action_spec.minimum + self._maximum = action_spec.maximum + self._noise_std = scale * (action_spec.maximum - action_spec.minimum) + self._env = env + + def step(self, action): + noisy_action = action + self._env.task.random.normal(scale=self._noise_std) + # Clip the noisy actions in place so that they fall within the bounds + # specified by the `action_spec`. Note that MuJoCo implicitly clips out-of- + # bounds control inputs, but we also clip here in case the actions do not + # correspond directly to MuJoCo actuators, or if there are other wrapper + # layers that expect the actions to be within bounds. + np.clip(noisy_action, self._minimum, self._maximum, out=noisy_action) + return self._env.step(noisy_action) + + def reset(self): + return self._env.reset() + + def observation_spec(self): + return self._env.observation_spec() + + def action_spec(self): + return self._env.action_spec() + + def __getattr__(self, name): + return getattr(self._env, name) diff --git a/Dreamer/local_dm_control_suite/wrappers/action_noise_test.py b/Dreamer/local_dm_control_suite/wrappers/action_noise_test.py new file mode 100755 index 0000000..dcc5330 --- /dev/null +++ b/Dreamer/local_dm_control_suite/wrappers/action_noise_test.py @@ -0,0 +1,136 @@ +# Copyright 2018 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tests for the action noise wrapper.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Internal dependencies. 
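# (Editor's note) A minimal usage sketch for the action-noise wrapper defined
# above, assuming a standard dm_control suite task; `scale` is the noise
# standard deviation expressed as a fraction of each action dimension's range:
#
#   from dm_control import suite
#   from dm_control.suite.wrappers import action_noise
#
#   env = action_noise.Wrapper(suite.load('cartpole', 'swingup'), scale=0.01)
#   time_step = env.reset()
#   action = env.action_spec().minimum   # any in-bounds action
#   time_step = env.step(action)         # Gaussian noise added, then clipped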
+from absl.testing import absltest +from absl.testing import parameterized +from dm_control.rl import control +from dm_control.suite.wrappers import action_noise +from dm_env import specs +import mock +import numpy as np + + +class ActionNoiseTest(parameterized.TestCase): + + def make_action_spec(self, lower=(-1.,), upper=(1.,)): + lower, upper = np.broadcast_arrays(lower, upper) + return specs.BoundedArray( + shape=lower.shape, dtype=float, minimum=lower, maximum=upper) + + def make_mock_env(self, action_spec=None): + action_spec = action_spec or self.make_action_spec() + env = mock.Mock(spec=control.Environment) + env.action_spec.return_value = action_spec + return env + + def assertStepCalledOnceWithCorrectAction(self, env, expected_action): + # NB: `assert_called_once_with()` doesn't support numpy arrays. + env.step.assert_called_once() + actual_action = env.step.call_args_list[0][0][0] + np.testing.assert_array_equal(expected_action, actual_action) + + @parameterized.parameters([ + dict(lower=np.r_[-1., 0.], upper=np.r_[1., 2.], scale=0.05), + dict(lower=np.r_[-1., 0.], upper=np.r_[1., 2.], scale=0.), + dict(lower=np.r_[-1., 0.], upper=np.r_[-1., 0.], scale=0.05), + ]) + def test_step(self, lower, upper, scale): + seed = 0 + std = scale * (upper - lower) + expected_noise = np.random.RandomState(seed).normal(scale=std) + action = np.random.RandomState(seed).uniform(lower, upper) + expected_noisy_action = np.clip(action + expected_noise, lower, upper) + task = mock.Mock(spec=control.Task) + task.random = np.random.RandomState(seed) + action_spec = self.make_action_spec(lower=lower, upper=upper) + env = self.make_mock_env(action_spec=action_spec) + env.task = task + wrapped_env = action_noise.Wrapper(env, scale=scale) + time_step = wrapped_env.step(action) + self.assertStepCalledOnceWithCorrectAction(env, expected_noisy_action) + self.assertIs(time_step, env.step(expected_noisy_action)) + + @parameterized.named_parameters([ + dict(testcase_name='within_bounds', action=np.r_[-1.], noise=np.r_[0.1]), + dict(testcase_name='below_lower', action=np.r_[-1.], noise=np.r_[-0.1]), + dict(testcase_name='above_upper', action=np.r_[1.], noise=np.r_[0.1]), + ]) + def test_action_clipping(self, action, noise): + lower = -1. + upper = 1. 
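# (Editor's note) `task.random.normal` is mocked below to return `noise`
# verbatim, so the wrapper must emit exactly
# np.clip(action + noise, lower, upper).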
+ expected_noisy_action = np.clip(action + noise, lower, upper) + task = mock.Mock(spec=control.Task) + task.random = mock.Mock(spec=np.random.RandomState) + task.random.normal.return_value = noise + action_spec = self.make_action_spec(lower=lower, upper=upper) + env = self.make_mock_env(action_spec=action_spec) + env.task = task + wrapped_env = action_noise.Wrapper(env) + time_step = wrapped_env.step(action) + self.assertStepCalledOnceWithCorrectAction(env, expected_noisy_action) + self.assertIs(time_step, env.step(expected_noisy_action)) + + @parameterized.parameters([ + dict(lower=np.r_[-1., 0.], upper=np.r_[1., np.inf]), + dict(lower=np.r_[np.nan, 0.], upper=np.r_[1., 2.]), + ]) + def test_error_if_action_bounds_non_finite(self, lower, upper): + action_spec = self.make_action_spec(lower=lower, upper=upper) + env = self.make_mock_env(action_spec=action_spec) + with self.assertRaisesWithLiteralMatch( + ValueError, + action_noise._BOUNDS_MUST_BE_FINITE.format(action_spec=action_spec)): + _ = action_noise.Wrapper(env) + + def test_reset(self): + env = self.make_mock_env() + wrapped_env = action_noise.Wrapper(env) + time_step = wrapped_env.reset() + env.reset.assert_called_once_with() + self.assertIs(time_step, env.reset()) + + def test_observation_spec(self): + env = self.make_mock_env() + wrapped_env = action_noise.Wrapper(env) + observation_spec = wrapped_env.observation_spec() + env.observation_spec.assert_called_once_with() + self.assertIs(observation_spec, env.observation_spec()) + + def test_action_spec(self): + env = self.make_mock_env() + wrapped_env = action_noise.Wrapper(env) + # `env.action_spec()` is called in `Wrapper.__init__()` + env.action_spec.reset_mock() + action_spec = wrapped_env.action_spec() + env.action_spec.assert_called_once_with() + self.assertIs(action_spec, env.action_spec()) + + @parameterized.parameters(['task', 'physics', 'control_timestep']) + def test_getattr(self, attribute_name): + env = self.make_mock_env() + wrapped_env = action_noise.Wrapper(env) + attr = getattr(wrapped_env, attribute_name) + self.assertIs(attr, getattr(env, attribute_name)) + + +if __name__ == '__main__': + absltest.main() diff --git a/Dreamer/local_dm_control_suite/wrappers/pixels.py b/Dreamer/local_dm_control_suite/wrappers/pixels.py new file mode 100755 index 0000000..0f55fff --- /dev/null +++ b/Dreamer/local_dm_control_suite/wrappers/pixels.py @@ -0,0 +1,120 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +"""Wrapper that adds pixel observations to a control environment.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +import dm_env +from dm_env import specs + +STATE_KEY = 'state' + + +class Wrapper(dm_env.Environment): + """Wraps a control environment and adds a rendered pixel observation.""" + + def __init__(self, env, pixels_only=True, render_kwargs=None, + observation_key='pixels'): + """Initializes a new pixel Wrapper. + + Args: + env: The environment to wrap. + pixels_only: If True (default), the original set of 'state' observations + returned by the wrapped environment will be discarded, and the + `OrderedDict` of observations will only contain pixels. If False, the + `OrderedDict` will contain the original observations as well as the + pixel observations. + render_kwargs: Optional `dict` containing keyword arguments passed to the + `mujoco.Physics.render` method. + observation_key: Optional custom string specifying the pixel observation's + key in the `OrderedDict` of observations. Defaults to 'pixels'. + + Raises: + ValueError: If `env`'s observation spec is not compatible with the + wrapper. Supported formats are a single array, or a dict of arrays. + ValueError: If `env`'s observation already contains the specified + `observation_key`. + """ + if render_kwargs is None: + render_kwargs = {} + + wrapped_observation_spec = env.observation_spec() + + if isinstance(wrapped_observation_spec, specs.Array): + self._observation_is_dict = False + invalid_keys = set([STATE_KEY]) + elif isinstance(wrapped_observation_spec, collections.MutableMapping): + self._observation_is_dict = True + invalid_keys = set(wrapped_observation_spec.keys()) + else: + raise ValueError('Unsupported observation spec structure.') + + if not pixels_only and observation_key in invalid_keys: + raise ValueError('Duplicate or reserved observation key {!r}.' + .format(observation_key)) + + if pixels_only: + self._observation_spec = collections.OrderedDict() + elif self._observation_is_dict: + self._observation_spec = wrapped_observation_spec.copy() + else: + self._observation_spec = collections.OrderedDict() + self._observation_spec[STATE_KEY] = wrapped_observation_spec + + # Extend observation spec. 
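# (Editor's note) Rendering one frame up front lets the wrapper take the
# pixel spec's shape and dtype straight from the renderer, so the spec can
# never disagree with the observations later produced under `render_kwargs`.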
+ pixels = env.physics.render(**render_kwargs) + pixels_spec = specs.Array( + shape=pixels.shape, dtype=pixels.dtype, name=observation_key) + self._observation_spec[observation_key] = pixels_spec + + self._env = env + self._pixels_only = pixels_only + self._render_kwargs = render_kwargs + self._observation_key = observation_key + + def reset(self): + time_step = self._env.reset() + return self._add_pixel_observation(time_step) + + def step(self, action): + time_step = self._env.step(action) + return self._add_pixel_observation(time_step) + + def observation_spec(self): + return self._observation_spec + + def action_spec(self): + return self._env.action_spec() + + def _add_pixel_observation(self, time_step): + if self._pixels_only: + observation = collections.OrderedDict() + elif self._observation_is_dict: + observation = type(time_step.observation)(time_step.observation) + else: + observation = collections.OrderedDict() + observation[STATE_KEY] = time_step.observation + + pixels = self._env.physics.render(**self._render_kwargs) + observation[self._observation_key] = pixels + return time_step._replace(observation=observation) + + def __getattr__(self, name): + return getattr(self._env, name) diff --git a/Dreamer/local_dm_control_suite/wrappers/pixels_test.py b/Dreamer/local_dm_control_suite/wrappers/pixels_test.py new file mode 100755 index 0000000..26b7fc1 --- /dev/null +++ b/Dreamer/local_dm_control_suite/wrappers/pixels_test.py @@ -0,0 +1,133 @@ +# Copyright 2017 The dm_control Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tests for the pixel wrapper.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +# Internal dependencies. +from absl.testing import absltest +from absl.testing import parameterized +from local_dm_control_suite import cartpole +from dm_control.suite.wrappers import pixels +import dm_env +from dm_env import specs + +import numpy as np + + +class FakePhysics(object): + + def render(self, *args, **kwargs): + del args + del kwargs + return np.zeros((4, 5, 3), dtype=np.uint8) + + +class FakeArrayObservationEnvironment(dm_env.Environment): + + def __init__(self): + self.physics = FakePhysics() + + def reset(self): + return dm_env.restart(np.zeros((2,))) + + def step(self, action): + del action + return dm_env.transition(0.0, np.zeros((2,))) + + def action_spec(self): + pass + + def observation_spec(self): + return specs.Array(shape=(2,), dtype=np.float) + + +class PixelsTest(parameterized.TestCase): + + @parameterized.parameters(True, False) + def test_dict_observation(self, pixels_only): + pixel_key = 'rgb' + + env = cartpole.swingup() + + # Make sure we are testing the right environment for the test. + observation_spec = env.observation_spec() + self.assertIsInstance(observation_spec, collections.OrderedDict) + + width = 320 + height = 240 + + # The wrapper should only add one observation. 
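# (Editor's note) `render_kwargs` is forwarded verbatim to `physics.render`,
# so both the added spec and the returned observation should come back as
# (height, width, 3) uint8 arrays, as asserted below.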
+ wrapped = pixels.Wrapper(env, + observation_key=pixel_key, + pixels_only=pixels_only, + render_kwargs={'width': width, 'height': height}) + + wrapped_observation_spec = wrapped.observation_spec() + self.assertIsInstance(wrapped_observation_spec, collections.OrderedDict) + + if pixels_only: + self.assertLen(wrapped_observation_spec, 1) + self.assertEqual([pixel_key], list(wrapped_observation_spec.keys())) + else: + expected_length = len(observation_spec) + 1 + self.assertLen(wrapped_observation_spec, expected_length) + expected_keys = list(observation_spec.keys()) + [pixel_key] + self.assertEqual(expected_keys, list(wrapped_observation_spec.keys())) + + # Check that the added spec item is consistent with the added observation. + time_step = wrapped.reset() + rgb_observation = time_step.observation[pixel_key] + wrapped_observation_spec[pixel_key].validate(rgb_observation) + + self.assertEqual(rgb_observation.shape, (height, width, 3)) + self.assertEqual(rgb_observation.dtype, np.uint8) + + @parameterized.parameters(True, False) + def test_single_array_observation(self, pixels_only): + pixel_key = 'depth' + + env = FakeArrayObservationEnvironment() + observation_spec = env.observation_spec() + self.assertIsInstance(observation_spec, specs.Array) + + wrapped = pixels.Wrapper(env, observation_key=pixel_key, + pixels_only=pixels_only) + wrapped_observation_spec = wrapped.observation_spec() + self.assertIsInstance(wrapped_observation_spec, collections.OrderedDict) + + if pixels_only: + self.assertLen(wrapped_observation_spec, 1) + self.assertEqual([pixel_key], list(wrapped_observation_spec.keys())) + else: + self.assertLen(wrapped_observation_spec, 2) + self.assertEqual([pixels.STATE_KEY, pixel_key], + list(wrapped_observation_spec.keys())) + + time_step = wrapped.reset() + + depth_observation = time_step.observation[pixel_key] + wrapped_observation_spec[pixel_key].validate(depth_observation) + + self.assertEqual(depth_observation.shape, (4, 5, 3)) + self.assertEqual(depth_observation.dtype, np.uint8) + +if __name__ == '__main__': + absltest.main() diff --git a/Dreamer/models.py b/Dreamer/models.py new file mode 100644 index 0000000..3e7d592 --- /dev/null +++ b/Dreamer/models.py @@ -0,0 +1,297 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras import layers as tfkl +from tensorflow_probability import distributions as tfd +from tensorflow.keras.mixed_precision import experimental as prec + +import tools + + +class RSSM(tools.Module): + + def __init__(self, stoch=30, deter=200, hidden=200, act=tf.nn.elu): + super().__init__() + self._activation = act + self._stoch_size = stoch + self._deter_size = deter + self._hidden_size = hidden + self._cell = tfkl.GRUCell(self._deter_size) + + def initial(self, batch_size): + dtype = prec.global_policy().compute_dtype + return dict( + mean=tf.zeros([batch_size, self._stoch_size], dtype), + std=tf.zeros([batch_size, self._stoch_size], dtype), + stoch=tf.zeros([batch_size, self._stoch_size], dtype), + deter=self._cell.get_initial_state(None, batch_size, dtype)) + + + @tf.function + def observe(self, embed, action, state=None): + if state is None: + state = self.initial(tf.shape(action)[0]) + embed = tf.transpose(embed, [1, 0, 2]) + action = tf.transpose(action, [1, 0, 2]) + post, prior = tools.static_scan( + lambda prev, inputs: self.obs_step( + prev[0], *inputs), + (action, embed), (state, state)) + post = {k: tf.transpose(v, [1, 0, 2]) for k, v in post.items()} + prior = {k: tf.transpose(v, [1, 0, 2]) for k, v in prior.items()} + return 
post, prior + + @tf.function + def imagine(self, action, state=None): + if state is None: + state = self.initial(tf.shape(action)[0]) + assert isinstance(state, dict), state + action = tf.transpose(action, [1, 0, 2]) + prior = tools.static_scan(self.img_step, action, state) + prior = {k: tf.transpose(v, [1, 0, 2]) for k, v in prior.items()} + return prior + + def get_feat(self, state): + return tf.concat([state['stoch'], state['deter']], -1) + + def get_dist(self, state): + return tfd.MultivariateNormalDiag(state['mean'], state['std']) + + @tf.function + def obs_step(self, prev_state, prev_action, embed): + prior = self.img_step(prev_state, prev_action) + x = tf.concat([prior['deter'], embed], -1) + x = self.get('obs1', tfkl.Dense, self._hidden_size, + self._activation)(x) + x = self.get('obs2', tfkl.Dense, 2 * self._stoch_size, None)(x) + mean, std = tf.split(x, 2, -1) + std = tf.nn.softplus(std) + 0.1 + stoch = self.get_dist({'mean': mean, 'std': std}).sample() + post = {'mean': mean, 'std': std, + 'stoch': stoch, 'deter': prior['deter']} + return post, prior + + @tf.function + def img_step(self, prev_state, prev_action): + x = tf.concat([prev_state['stoch'], prev_action], -1) + x = self.get('img1', tfkl.Dense, self._hidden_size, + self._activation)(x) + x, deter = self._cell(x, [prev_state['deter']]) + deter = deter[0] # Keras wraps the state in a list. + x = self.get('img2', tfkl.Dense, self._hidden_size, + self._activation)(x) + x = self.get('img3', tfkl.Dense, 2 * self._stoch_size, None)(x) + mean, std = tf.split(x, 2, -1) + std = tf.nn.softplus(std) + 0.1 + stoch = self.get_dist({'mean': mean, 'std': std}).sample() + prior = {'mean': mean, 'std': std, 'stoch': stoch, 'deter': deter} + return prior + + +class ConvEncoder(tools.Module): + + def __init__(self, depth=32, act=tf.nn.relu, image_size=64): + self._act = act + self._depth = depth + self._image_size = image_size + + if image_size == 64: + self._outdim = 32 * self._depth + self._kernel_sizes = [4, 4, 4, 4] + elif image_size == 32: + self._outdim = 8 * self._depth + self._kernel_sizes = [3, 3, 3, 3] + elif image_size == 84: + self._outdim = 72 * self._depth + self._kernel_sizes = [4, 4, 4, 4] + else: + raise NotImplementedError + + def __call__(self, obs): + kwargs = dict(strides=2, activation=self._act) + x = tf.reshape(obs['image'], (-1,) + tuple(obs['image'].shape[-3:])) + x = self.get('h1', tfkl.Conv2D, 1 * self._depth, + self._kernel_sizes[0], **kwargs)(x) + x = self.get('h2', tfkl.Conv2D, 2 * self._depth, + self._kernel_sizes[1], **kwargs)(x) + x = self.get('h3', tfkl.Conv2D, 4 * self._depth, + self._kernel_sizes[2], **kwargs)(x) + x = self.get('h4', tfkl.Conv2D, 8 * self._depth, + self._kernel_sizes[3], **kwargs)(x) + shape = tf.concat([tf.shape(obs['image'])[:-3], [self._outdim]], 0) + return tf.reshape(x, shape) + + +class ConvDecoder(tools.Module): + + def __init__(self, depth=32, act=tf.nn.relu, shape=(64, 64, 3)): + self._act = act + self._depth = depth + self._shape = shape + + if shape[0] == 64: + self._outdim = 32 * self._depth + self._kernel_sizes = [5, 5, 6, 6] + elif shape[0] == 32: + self._outdim = 8 * self._depth + self._kernel_sizes = [3, 3, 3, 4] + elif shape[0] == 84: + self._outdim = 72 * self._depth + self._kernel_sizes = [7, 6, 6, 6] + else: + raise NotImplementedError + + def __call__(self, features): + kwargs = dict(strides=2, activation=self._act) + x = self.get('h1', tfkl.Dense, self._outdim, None)(features) + x = tf.reshape(x, [-1, 1, 1, self._outdim]) + x = self.get('h2', tfkl.Conv2DTranspose, + 
4 * self._depth, self._kernel_sizes[0], **kwargs)(x) + x = self.get('h3', tfkl.Conv2DTranspose, + 2 * self._depth, self._kernel_sizes[1], **kwargs)(x) + x = self.get('h4', tfkl.Conv2DTranspose, + 1 * self._depth, self._kernel_sizes[2], **kwargs)(x) + x = self.get('h5', tfkl.Conv2DTranspose, + self._shape[-1], self._kernel_sizes[3], strides=2)(x) + mean = tf.reshape(x, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)) + + +class ConvDecoderMask(tools.Module): + + def __init__(self, depth=32, act=tf.nn.relu, shape=(64, 64, 3)): + self._act = act + self._depth = depth + self._shape = shape + + if shape[0] == 64: + self._outdim = 32 * self._depth + self._kernel_sizes = [5, 5, 6, 6] + elif shape[0] == 32: + self._outdim = 8 * self._depth + self._kernel_sizes = [3, 3, 3, 4] + elif shape[0] == 84: + self._outdim = 72 * self._depth + self._kernel_sizes = [7, 6, 6, 6] + else: + raise NotImplementedError + + def __call__(self, features): + kwargs = dict(strides=2, activation=self._act) + x = self.get('h1', tfkl.Dense, self._outdim, None)(features) + x = tf.reshape(x, [-1, 1, 1, self._outdim]) + x = self.get('h2', tfkl.Conv2DTranspose, + 4 * self._depth, self._kernel_sizes[0], **kwargs)(x) + x = self.get('h3', tfkl.Conv2DTranspose, + 2 * self._depth, self._kernel_sizes[1], **kwargs)(x) + x = self.get('h4', tfkl.Conv2DTranspose, + 1 * self._depth, self._kernel_sizes[2], **kwargs)(x) + x = self.get('h5', tfkl.Conv2DTranspose, + 3 + self._shape[-1], self._kernel_sizes[3], strides=2)(x) + mean, mask = tf.split(x, [3, 3], -1) + mean = tf.reshape(mean, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + mask = tf.reshape(mask, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), mask + + +class ConvDecoderMaskEnsemble(tools.Module): + """ + ensemble two convdecoder with outputs + """ + + def __init__(self, decoder1, decoder2, precision): + self._decoder1 = decoder1 + self._decoder2 = decoder2 + self._precision = 'float' + str(precision) + self._shape = decoder1._shape + + def __call__(self, feat1, feat2): + kwargs = dict(strides=1, activation=tf.nn.sigmoid) + pred1, mask1 = self._decoder1(feat1) + pred2, mask2 = self._decoder2(feat2) + mean1 = pred1.submodules[0].loc + mean2 = pred2.submodules[0].loc + mask_feat = tf.concat([mask1, mask2], -1) + mask = self.get('mask1', tfkl.Conv2D, 1, 1, **kwargs)(mask_feat) + mask_use1 = mask + mask_use2 = 1-mask + mean = mean1 * tf.cast(mask_use1, self._precision) + \ + mean2 * tf.cast(mask_use2, self._precision) + return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), pred1, pred2, tf.cast(mask_use1, self._precision) + + +class InverseDecoder(tools.Module): + + def __init__(self, shape, layers, units, act=tf.nn.elu): + self._shape = shape + self._layers = layers + self._units = units + self._act = act + + def __call__(self, features): + x = tf.concat([features[:, :-1], features[:, 1:]], -1) + for index in range(self._layers): + x = self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x) + x = self.get(f'hout', tfkl.Dense, np.prod(self._shape))(x) + return tfd.Independent(tfd.Normal(x, 1), 1) + + +class DenseDecoder(tools.Module): + + def __init__(self, shape, layers, units, dist='normal', act=tf.nn.elu): + self._shape = shape + self._layers = layers + self._units = units + self._dist = dist + self._act = act + + def __call__(self, features): + x = features + for index in range(self._layers): + x = 
self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x) + x = self.get(f'hout', tfkl.Dense, np.prod(self._shape))(x) + x = tf.reshape(x, tf.concat([tf.shape(features)[:-1], self._shape], 0)) + if self._dist == 'normal': + return tfd.Independent(tfd.Normal(x, 1), len(self._shape)) + if self._dist == 'binary': + return tfd.Independent(tfd.Bernoulli(x), len(self._shape)) + raise NotImplementedError(self._dist) + + +class ActionDecoder(tools.Module): + + def __init__( + self, size, layers, units, dist='tanh_normal', act=tf.nn.elu, + min_std=1e-4, init_std=5, mean_scale=5): + self._size = size + self._layers = layers + self._units = units + self._dist = dist + self._act = act + self._min_std = min_std + self._init_std = init_std + self._mean_scale = mean_scale + + def __call__(self, features): + raw_init_std = np.log(np.exp(self._init_std) - 1) + x = features + for index in range(self._layers): + x = self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x) + if self._dist == 'tanh_normal': + # https://www.desmos.com/calculator/rcmcf5jwe7 + x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x) + mean, std = tf.split(x, 2, -1) + mean = self._mean_scale * tf.tanh(mean / self._mean_scale) + std = tf.nn.softplus(std + raw_init_std) + self._min_std + dist = tfd.Normal(mean, std) + dist = tfd.TransformedDistribution(dist, tools.TanhBijector()) + dist = tfd.Independent(dist, 1) + dist = tools.SampleDist(dist) + elif self._dist == 'onehot': + x = self.get(f'hout', tfkl.Dense, self._size)(x) + dist = tools.OneHotDist(x) + else: + raise NotImplementedError(dist) + return dist diff --git a/Dreamer/run.py b/Dreamer/run.py new file mode 100644 index 0000000..842bacb --- /dev/null +++ b/Dreamer/run.py @@ -0,0 +1,121 @@ +import wrappers +import tools +import tensorflow as tf +import argparse +import functools +import yaml +import os +import pathlib +import sys + +from tensorflow.keras.mixed_precision import experimental as prec +from dreamers import Dreamer, SeparationDreamer, InverseDreamer +from env_tools import count_steps, make_env + +METHOD2DREAMER = { + 'dreamer': Dreamer, + 'tia': SeparationDreamer, + 'inverse': InverseDreamer +} + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['MUJOCO_GL'] = 'egl' +tf.get_logger().setLevel('ERROR') +sys.path.append(str(pathlib.Path(__file__).parent)) + + +def main(method, config): + + if method == 'separation': + config.logdir = os.path.join( + config.logdir, config.task, + 'separation' + '_' + str(config.disen_neg_rew_scale) + + '_' + str(config.disen_rec_scale), + str(config.seed)) + else: + config.logdir = os.path.join( + config.logdir, config.task, + method, + str(config.seed)) + + logdir = pathlib.Path(config.logdir) + logdir.mkdir(parents=True, exist_ok=True) + snapshot_dir = os.path.join(config.logdir, 'snapshots') + snapshot_dir = pathlib.Path(snapshot_dir) + snapshot_dir.mkdir(parents=True, exist_ok=True) + with open(os.path.join(config.logdir, 'config.yaml'), 'w') as f: + yaml.dump(vars(config), f, sort_keys=False) + + if config.gpu_growth: + for gpu in tf.config.experimental.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(gpu, True) + assert config.precision in (16, 32), config.precision + if config.precision == 16: + prec.set_policy(prec.Policy('mixed_float16')) + config.steps = int(config.steps) + config.logdir = logdir + print('Logdir', config.logdir) + + # Create environments. 
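# (Editor's note) Train envs are built with store=True, so their episodes are
# saved under logdir/episodes and feed the replay dataset; test envs use
# store=False and only contribute evaluation statistics.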
+ datadir = config.logdir / 'episodes' + writer = tf.summary.create_file_writer( + str(config.logdir), max_queue=1000, flush_millis=20000) + writer.set_as_default() + train_envs = [wrappers.Async(lambda: make_env( + config, writer, 'train', datadir, config.video_dir, store=True), config.parallel) + for _ in range(config.envs)] + test_envs = [wrappers.Async(lambda: make_env( + config, writer, 'test', datadir, config.video_dir, store=False), config.parallel) + for _ in range(config.envs)] + actspace = train_envs[0].action_space + + # Prefill dataset with random episodes. + step = count_steps(datadir, config) + prefill = max(0, config.prefill - step) + print(f'Prefill dataset with {prefill} steps.') + def random_agent(o, d, _): return ([actspace.sample() for _ in d], None) + tools.simulate(random_agent, train_envs, prefill / config.action_repeat) + writer.flush() + + # Train and regularly evaluate the agent. + step = count_steps(datadir, config) + print(f'Simulating agent for {config.steps-step} steps.') + DreamerModel = METHOD2DREAMER[method] + agent = DreamerModel(config, datadir, actspace, writer) + if (config.logdir / 'variables.pkl').exists(): + print('Load checkpoint.') + agent.load(config.logdir / 'variables.pkl') + state = None + should_snapshot = tools.Every(config.snapshot_every) + while step < config.steps: + print('Start evaluation.') + tools.simulate( + functools.partial(agent, training=False), test_envs, episodes=1) + writer.flush() + print('Start collection.') + steps = config.eval_every // config.action_repeat + state = tools.simulate(agent, train_envs, steps, state=state) + step = count_steps(datadir, config) + agent.save(config.logdir / 'variables.pkl') + if should_snapshot(step): + agent.save(snapshot_dir / ('variables_' + str(step) + '.pkl')) + for env in train_envs + test_envs: + env.close() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--method', type=str, choices=['dreamer', 'inverse', 'tia'], required=True) + parser.add_argument('--configs', nargs='+', required=True) + args, remaining = parser.parse_known_args() + config_path = 'train_configs/' + args.method + '.yaml' + configs = yaml.safe_load( + (pathlib.Path(__file__).parent / config_path).read_text()) + config_ = {} + for name in args.configs: + config_.update(configs[name]) + parser = argparse.ArgumentParser() + for key, value in config_.items(): + parser.add_argument( + f'--{key}', type=tools.args_type(value), default=value) + main(args.method, parser.parse_args(remaining)) diff --git a/Dreamer/tools.py b/Dreamer/tools.py new file mode 100644 index 0000000..3bd8491 --- /dev/null +++ b/Dreamer/tools.py @@ -0,0 +1,467 @@ +import datetime +import io +import pathlib +import pickle +import re +import uuid + +import gym +import numpy as np +import tensorflow as tf +import tensorflow.compat.v1 as tf1 # pylint: disable=E +import tensorflow_probability as tfp +from tensorflow.keras.mixed_precision import experimental as prec +from tensorflow_probability import distributions as tfd + + +class AttrDict(dict): + + __setattr__ = dict.__setitem__ + __getattr__ = dict.__getitem__ + + def from_dict(self, src_dict): + for key in src_dict: + setattr(self, key, src_dict[key]) + + +class Module(tf.Module): + + def save(self, filename): + values = tf.nest.map_structure(lambda x: x.numpy(), self.variables) + with pathlib.Path(filename).open('wb') as f: + pickle.dump(values, f) + + def load(self, filename): + with pathlib.Path(filename).open('rb') as f: + values = pickle.load(f) + 
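# Pair each pickled numpy array with its variable (tf.nest preserves the
# ordering used by save() above) and assign it in place.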
tf.nest.map_structure(lambda x, y: x.assign(y), self.variables, values) + + def get(self, name, ctor, *args, **kwargs): + # Create or get layer by name to avoid mentioning it in the constructor. + if not hasattr(self, '_modules'): + self._modules = {} + if name not in self._modules: + self._modules[name] = ctor(*args, **kwargs) + return self._modules[name] + + +def nest_summary(structure): + if isinstance(structure, dict): + return {k: nest_summary(v) for k, v in structure.items()} + if isinstance(structure, list): + return [nest_summary(v) for v in structure] + if hasattr(structure, 'shape'): + return str(structure.shape).replace(', ', 'x').strip('(), ') + return '?' + + +def graph_summary(writer, fn, step, *args): + def inner(*args): + tf.summary.experimental.set_step(step) + with writer.as_default(): + fn(*args) + return tf.numpy_function(inner, args, []) + + +def video_summary(name, video, step=None, fps=20): + if isinstance(name, type(np.zeros(1))): + name = str(name) + else: + name = name if isinstance(name, str) else name.decode('utf-8') + if np.issubdtype(video.dtype, np.floating): + video = np.clip(255 * video, 0, 255).astype(np.uint8) + B, T, H, W, C = video.shape + try: + frames = video.transpose((1, 2, 0, 3, 4)).reshape((T, H, B * W, C)) + summary = tf1.Summary() + image = tf1.Summary.Image(height=B * H, width=T * W, colorspace=C) + image.encoded_image_string = encode_gif(frames, fps) + summary.value.add(tag=name + '/gif', image=image) + tf.summary.experimental.write_raw_pb(summary.SerializeToString(), step) + except (IOError, OSError) as e: + print('GIF summaries require ffmpeg in $PATH.', e) + frames = video.transpose((0, 2, 1, 3, 4)).reshape((1, B * H, T * W, C)) + tf.summary.image(name + '/grid', frames, step) + + +def encode_gif(frames, fps): + from subprocess import Popen, PIPE + h, w, c = frames[0].shape + pxfmt = {1: 'gray', 3: 'rgb24'}[c] + cmd = ' '.join([ + f'ffmpeg -y -f rawvideo -vcodec rawvideo', + f'-r {fps:.02f} -s {w}x{h} -pix_fmt {pxfmt} -i - -filter_complex', + f'[0:v]split[x][z];[z]palettegen[y];[x]fifo[x];[x][y]paletteuse', + f'-r {fps:.02f} -f gif -']) + proc = Popen(cmd.split(' '), stdin=PIPE, stdout=PIPE, stderr=PIPE) + for image in frames: + proc.stdin.write(image.tostring()) + out, err = proc.communicate() + if proc.returncode: + raise IOError('\n'.join([' '.join(cmd), err.decode('utf8')])) + del proc + return out + + +def simulate(agent, envs, steps=0, episodes=0, state=None): + # Initialize or unpack simulation state. + if state is None: + step, episode = 0, 0 + done = np.ones(len(envs), np.bool) + length = np.zeros(len(envs), np.int32) + obs = [None] * len(envs) + agent_state = None + else: + step, episode, done, length, obs, agent_state = state + while (steps and step < steps) or (episodes and episode < episodes): + # Reset envs if necessary. + if done.any(): + indices = [index for index, d in enumerate(done) if d] + promises = [envs[i].reset(blocking=False) for i in indices] + for index, promise in zip(indices, promises): + obs[index] = promise() + # Step agents. + # if use augmentation, need to modify dreamer.policy or here. + obs = {k: np.stack([o[k] for o in obs]) for k in obs[0]} + action, agent_state = agent(obs, done, agent_state) + action = np.array(action) + assert len(action) == len(envs) + # Step envs. 
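# (Editor's note) With blocking=False each Async env returns a promise;
# calling p() afterwards collects the results, so all envs step concurrently.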
+ promises = [e.step(a, blocking=False) for e, a in zip(envs, action)] + obs, _, done = zip(*[p()[:3] for p in promises]) + obs = list(obs) + done = np.stack(done) + episode += int(done.sum()) + length += 1 + step += (done * length).sum() + length *= (1 - done) + # Return new state to allow resuming the simulation. + return (step - steps, episode - episodes, done, length, obs, agent_state) + + +def count_episodes(directory): + filenames = directory.glob('*.npz') + lengths = [int(n.stem.rsplit('-', 1)[-1]) - 1 for n in filenames] + episodes, steps = len(lengths), sum(lengths) + return episodes, steps + + +def save_episodes(directory, episodes): + directory = pathlib.Path(directory).expanduser() + directory.mkdir(parents=True, exist_ok=True) + timestamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S') + for episode in episodes: + identifier = str(uuid.uuid4().hex) + length = len(episode['reward']) + filename = directory / f'{timestamp}-{identifier}-{length}.npz' + with io.BytesIO() as f1: + np.savez_compressed(f1, **episode) + f1.seek(0) + with filename.open('wb') as f2: + f2.write(f1.read()) + + +def load_episodes(directory, rescan, length=None, balance=False, seed=0): + directory = pathlib.Path(directory).expanduser() + random = np.random.RandomState(seed) + cache = {} + while True: + for filename in directory.glob('*.npz'): + if filename not in cache: + try: + with filename.open('rb') as f: + episode = np.load(f) + episode = {k: episode[k] for k in episode.keys()} + except Exception as e: + print(f'Could not load episode: {e}') + continue + cache[filename] = episode + keys = list(cache.keys()) + for index in random.choice(len(keys), rescan): + episode = cache[keys[index]] + if length: + total = len(next(iter(episode.values()))) + available = total - length + if available < 1: + print(f'Skipped short episode of length {available}({total}/{length}).') + continue + if balance: + index = min(random.randint(0, total), available) + else: + index = int(random.randint(0, available)) + episode = {k: v[index: index + length] + for k, v in episode.items()} + yield episode + +class DummyEnv: + + def __init__(self): + self._random = np.random.RandomState(seed=0) + self._step = None + + @property + def observation_space(self): + low = np.zeros([64, 64, 3], dtype=np.uint8) + high = 255 * np.ones([64, 64, 3], dtype=np.uint8) + spaces = {'image': gym.spaces.Box(low, high)} + return gym.spaces.Dict(spaces) + + @property + def action_space(self): + low = -np.ones([5], dtype=np.float32) + high = np.ones([5], dtype=np.float32) + return gym.spaces.Box(low, high) + + def reset(self): + self._step = 0 + obs = self.observation_space.sample() + return obs + + def step(self, action): + obs = self.observation_space.sample() + reward = self._random.uniform(0, 1) + self._step += 1 + done = self._step >= 1000 + info = {} + return obs, reward, done, info + + +class SampleDist: + + def __init__(self, dist, samples=100): + self._dist = dist + self._samples = samples + + @property + def name(self): + return 'SampleDist' + + def __getattr__(self, name): + return getattr(self._dist, name) + + def mean(self): + samples = self._dist.sample(self._samples) + return tf.reduce_mean(samples, 0) + + def mode(self): + sample = self._dist.sample(self._samples) + logprob = self._dist.log_prob(sample) + return tf.gather(sample, tf.argmax(logprob))[0] # pylint: disable=E + + def entropy(self): + sample = self._dist.sample(self._samples) + logprob = self.log_prob(sample) + return -tf.reduce_mean(logprob, 0) + + +class OneHotDist: + + 
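# (Editor's note) Wraps tfd.Categorical to produce one-hot samples with a
# straight-through gradient: sample() below adds probs - stop_gradient(probs),
# which is zero in the forward pass but routes gradients to the logits.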
def __init__(self, logits=None, probs=None): + self._dist = tfd.Categorical(logits=logits, probs=probs) + self._num_classes = self.mean().shape[-1] + self._dtype = prec.global_policy().compute_dtype + + @property + def name(self): + return 'OneHotDist' + + def __getattr__(self, name): + return getattr(self._dist, name) + + def prob(self, events): + indices = tf.argmax(events, axis=-1) + return self._dist.prob(indices) + + def log_prob(self, events): + indices = tf.argmax(events, axis=-1) + return self._dist.log_prob(indices) + + def mean(self): + return self._dist.probs_parameter() + + def mode(self): + return self._one_hot(self._dist.mode()) + + def sample(self, amount=None): + amount = [amount] if amount else [] + indices = self._dist.sample(*amount) + sample = self._one_hot(indices) + probs = self._dist.probs_parameter() + sample += tf.cast(probs - tf.stop_gradient(probs), self._dtype) + return sample + + def _one_hot(self, indices): + return tf.one_hot(indices, self._num_classes, dtype=self._dtype) # pylint: disable=E + + + +class TanhBijector(tfp.bijectors.Bijector): + + def __init__(self, validate_args=False, name='tanh'): + super().__init__( + forward_min_event_ndims=0, + validate_args=validate_args, + name=name) + + def _forward(self, x): + return tf.nn.tanh(x) + + def _inverse(self, y): + dtype = y.dtype + y = tf.cast(y, tf.float32) + y = tf.where( + tf.less_equal(tf.abs(y), 1.), + tf.clip_by_value(y, -0.99999997, 0.99999997), y) + y = tf.atanh(y) + y = tf.cast(y, dtype) + return y + + def _forward_log_det_jacobian(self, x): + log2 = tf.math.log(tf.constant(2.0, dtype=x.dtype)) + return 2.0 * (log2 - x - tf.nn.softplus(-2.0 * x)) + + +def lambda_return( + reward, value, pcont, bootstrap, lambda_, axis): + # Setting lambda=1 gives a discounted Monte Carlo return. + # Setting lambda=0 gives a fixed 1-step return. 
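# (Editor's note) Concretely, the scan below computes, from t = T-1 down to 0:
#   R_t = r_t + pcont_t * ((1 - lambda) * V(s_{t+1}) + lambda * R_{t+1}),
# with R_T = bootstrap (zeros when no bootstrap value is supplied).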
+ assert reward.shape.ndims == value.shape.ndims, (reward.shape, value.shape) + if isinstance(pcont, (int, float)): + pcont = pcont * tf.ones_like(reward) + dims = list(range(reward.shape.ndims)) + dims = [axis] + dims[1:axis] + [0] + dims[axis + 1:] + if axis != 0: + reward = tf.transpose(reward, dims) + value = tf.transpose(value, dims) + pcont = tf.transpose(pcont, dims) + if bootstrap is None: + bootstrap = tf.zeros_like(value[-1]) + next_values = tf.concat([value[1:], bootstrap[None]], 0) + inputs = reward + pcont * next_values * (1 - lambda_) + returns = static_scan( + lambda agg, cur: cur[0] + cur[1] * lambda_ * agg, + (inputs, pcont), bootstrap, reverse=True) + if axis != 0: + returns = tf.transpose(returns, dims) + return returns + + +class Adam(tf.Module): + + def __init__(self, name, modules, lr, clip=None, wd=None, wdpattern=r'.*'): + self._name = name + self._modules = modules + self._clip = clip + self._wd = wd + self._wdpattern = wdpattern + self._opt = tf.optimizers.Adam(lr) + self._opt = prec.LossScaleOptimizer(self._opt, 'dynamic') + self._variables = None + + @property + def variables(self): + return self._opt.variables() + + def __call__(self, tape, loss): + if self._variables is None: + variables = [module.variables for module in self._modules] + self._variables = tf.nest.flatten(variables) + count = sum(np.prod(x.shape) for x in self._variables) + print(f'Found {count} {self._name} parameters.') + assert len(loss.shape) == 0, loss.shape + with tape: + loss = self._opt.get_scaled_loss(loss) + grads = tape.gradient(loss, self._variables) + grads = self._opt.get_unscaled_gradients(grads) + norm = tf.linalg.global_norm(grads) + if self._clip: + grads, _ = tf.clip_by_global_norm(grads, self._clip, norm) + if self._wd: + context = tf.distribute.get_replica_context() + context.merge_call(self._apply_weight_decay) + self._opt.apply_gradients(zip(grads, self._variables)) + return norm + + def _apply_weight_decay(self, strategy): + print('Applied weight decay to variables:') + for var in self._variables: + if re.search(self._wdpattern, self._name + '/' + var.name): + print('- ' + self._name + '/' + var.name) + strategy.extended.update(var, lambda var: self._wd * var) + + +def args_type(default): + if isinstance(default, bool): + return lambda x: bool(['False', 'True'].index(x)) + if isinstance(default, int): + return lambda x: float(x) if ('e' in x or '.' 
in x) else int(x) + if isinstance(default, pathlib.Path): + return lambda x: pathlib.Path(x).expanduser() + return type(default) + + +def static_scan(fn, inputs, start, reverse=False): + last = start + outputs = [[] for _ in tf.nest.flatten(start)] + indices = range(len(tf.nest.flatten(inputs)[0])) + if reverse: + indices = reversed(indices) + for index in indices: + inp = tf.nest.map_structure(lambda x: x[index], inputs) + last = fn(last, inp) + [o.append(l) for o, l in zip(outputs, tf.nest.flatten(last))] + if reverse: + outputs = [list(reversed(x)) for x in outputs] + outputs = [tf.stack(x, 0) for x in outputs] + return tf.nest.pack_sequence_as(start, outputs) + + +def _mnd_sample(self, sample_shape=(), seed=None, name='sample'): + return tf.random.normal( + tuple(sample_shape) + tuple(self.event_shape), + self.mean(), self.stddev(), self.dtype, seed, name) + + +tfd.MultivariateNormalDiag.sample = _mnd_sample + + +def _cat_sample(self, sample_shape=(), seed=None, name='sample'): + assert len(sample_shape) in (0, 1), sample_shape + assert len(self.logits_parameter().shape) == 2 + indices = tf.random.categorical( + self.logits_parameter(), sample_shape[0] if sample_shape else 1, + self.dtype, seed, name) + if not sample_shape: + indices = indices[..., 0] + return indices + + +tfd.Categorical.sample = _cat_sample + + +class Every: + + def __init__(self, every): + self._every = every + self._last = None + + def __call__(self, step): + if self._last is None: + self._last = step + return True + if step >= self._last + self._every: + self._last += self._every + return True + return False + + +class Once: + + def __init__(self): + self._once = True + + def __call__(self): + if self._once: + self._once = False + return True + return False diff --git a/Dreamer/train_configs/dreamer.yaml b/Dreamer/train_configs/dreamer.yaml new file mode 100644 index 0000000..1c8f68e --- /dev/null +++ b/Dreamer/train_configs/dreamer.yaml @@ -0,0 +1,73 @@ +dmc: + + logdir: ./ + video_dir: ./ + debug: False + seed: 0 + steps: 1000000.0 + snapshot_every: 20000.0 + eval_every: 5000.0 + log_every: 5000.0 + image_size: 64 + log_scalars: true + log_images: true + gpu_growth: true + precision: 16 + task: dmc_cheetah_run_driving + envs: 1 + parallel: none + action_repeat: 2 + time_limit: 1000 + prefill: 5000 + eval_noise: 0.0 + clip_rewards: none + deter_size: 283 + stoch_size: 42 + num_units: 400 + dense_act: elu + cnn_act: relu + cnn_depth: 45 + pcont: false + free_nats: 3.0 + kl_scale: 1.0 + pcont_scale: 10.0 + weight_decay: 0.0 + weight_decay_pattern: .* + batch_size: 50 + batch_length: 50 + train_every: 1000 + train_steps: 100 + pretrain: 100 + model_lr: 0.0006 + value_lr: 8.0e-05 + actor_lr: 8.0e-05 + grad_clip: 100.0 + dataset_balance: false + discount: 0.99 + disclam: 0.95 + horizon: 15 + action_dist: tanh_normal + action_init_std: 5.0 + expl: additive_gaussian + expl_amount: 0.3 + expl_decay: 0.0 + expl_min: 0.0 + +debug: + + debug: True + pretrain: 1 + prefill: 1 + train_steps: 1 + deter_size: 20 + stoch_size: 3 + num_units: 40 + cnn_depth: 3 + batch_size: 10 + batch_length: 20 + + + + + + diff --git a/Dreamer/train_configs/inverse.yaml b/Dreamer/train_configs/inverse.yaml new file mode 100644 index 0000000..57b6fdc --- /dev/null +++ b/Dreamer/train_configs/inverse.yaml @@ -0,0 +1,66 @@ +dmc: + + logdir: ./ + video_dir: ./ + debug: False + seed: 0 + steps: 1000000.0 + snapshot_every: 20000.0 + eval_every: 5000.0 + log_every: 5000.0 + image_size: 64 + log_scalars: true + log_images: false + gpu_growth: true + 
precision: 16 + task: dmc_cheetah_run_driving + envs: 1 + parallel: none + action_repeat: 2 + time_limit: 1000 + prefill: 5000 + eval_noise: 0.0 + clip_rewards: none + deter_size: 283 + stoch_size: 42 + num_units: 400 + dense_act: elu + cnn_act: relu + cnn_depth: 45 + pcont: false + free_nats: 3.0 + kl_scale: 1.0 + pcont_scale: 10.0 + weight_decay: 0.0 + weight_decay_pattern: .* + batch_size: 50 + batch_length: 50 + train_every: 1000 + train_steps: 100 + pretrain: 100 + model_lr: 0.0006 + value_lr: 8.0e-05 + actor_lr: 8.0e-05 + grad_clip: 100.0 + dataset_balance: false + discount: 0.99 + disclam: 0.95 + horizon: 15 + action_dist: tanh_normal + action_init_std: 5.0 + expl: additive_gaussian + expl_amount: 0.3 + expl_decay: 0.0 + expl_min: 0.0 + +debug: + + debug: True + pretrain: 1 + prefill: 1 + train_steps: 1 + batch_size: 10 + batch_length: 20 + + + diff --git a/Dreamer/train_configs/tia.yaml b/Dreamer/train_configs/tia.yaml new file mode 100644 index 0000000..ec4abc8 --- /dev/null +++ b/Dreamer/train_configs/tia.yaml @@ -0,0 +1,75 @@ +dmc: + + logdir: ./ + video_dir: ./ + debug: False + seed: 0 + steps: 1000000.0 + snapshot_every: 20000.0 + eval_every: 5000.0 + log_every: 5000.0 + image_size: 64 + log_scalars: true + log_images: true + gpu_growth: true + precision: 16 + task: dmc_cheetah_run_driving + envs: 1 + parallel: none + action_repeat: 2 + time_limit: 1000 + prefill: 5000 + eval_noise: 0.0 + clip_rewards: none + deter_size: 200 + stoch_size: 30 + disen_deter_size: 200 + disen_stoch_size: 30 + num_units: 400 + dense_act: elu + cnn_act: relu + cnn_depth: 26 + disen_cnn_depth: 26 + pcont: false + free_nats: 3.0 + num_reward_opt_iters: 20 + disen_neg_rew_scale: 20000.0 + disen_rec_scale: 1.5 + disen_kl_scale: 1.0 + kl_scale: 1.0 + reward_scale: 1.0 + pcont_scale: 10.0 + weight_decay: 0.0 + weight_decay_pattern: .* + batch_size: 50 + batch_length: 50 + train_every: 1000 + train_steps: 100 + pretrain: 100 + disen_reward_lr: 0.0006 + model_lr: 0.0006 + value_lr: 8.0e-05 + actor_lr: 8.0e-05 + grad_clip: 100.0 + dataset_balance: false + discount: 0.99 + disclam: 0.95 + horizon: 15 + action_dist: tanh_normal + action_init_std: 5.0 + expl: additive_gaussian + expl_amount: 0.3 + expl_decay: 0.0 + expl_min: 0.0 + +debug: + + debug: True + pretrain: 1 + prefill: 1 + train_steps: 1 + batch_size: 10 + batch_length: 20 + + + diff --git a/Dreamer/wrappers.py b/Dreamer/wrappers.py new file mode 100644 index 0000000..abdd142 --- /dev/null +++ b/Dreamer/wrappers.py @@ -0,0 +1,540 @@ +import atexit +import functools +import sys +import threading +import traceback + +import gym +import numpy as np +from PIL import Image +from collections import deque + +from numpy.core import overrides + + +class DMC2GYMWrapper: + + def __init__(self, env): + self._env = env + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def observation_space(self): + spaces = {} + spaces['image'] = gym.spaces.Box( + 0, 255, (self._env._height, self._env._width, 3,), dtype=np.uint8) + return gym.spaces.Dict(spaces) + + def step(self, action): + image, reward, done, info = self._env.step(action) + obs = {'image': image} + return obs, reward, done, info + + def reset(self): + image = self._env.reset() + obs = {'image': image} + return obs + + +class DeepMindControl: + + def __init__(self, name, size=(64, 64), camera=None): + domain, task = name.split('_', 1) + if domain == 'cup': # Only domain with multiple words. 
+ domain = 'ball_in_cup' + if isinstance(domain, str): + from dm_control import suite + self._env = suite.load(domain, task) + else: + assert task is None + self._env = domain() + self._size = size + if camera is None: + camera = dict(quadruped=2).get(domain, 0) + self._camera = camera + + @property + def observation_space(self): + spaces = {} + for key, value in self._env.observation_spec().items(): + spaces[key] = gym.spaces.Box( + -np.inf, np.inf, value.shape, dtype=np.float32) + spaces['image'] = gym.spaces.Box( + 0, 255, self._size + (3,), dtype=np.uint8) + return gym.spaces.Dict(spaces) + + @property + def action_space(self): + spec = self._env.action_spec() + return gym.spaces.Box(spec.minimum, spec.maximum, dtype=np.float32) + + def step(self, action): + time_step = self._env.step(action) + obs = dict(time_step.observation) + obs['image'] = self.render() + reward = time_step.reward or 0 + done = time_step.last() + info = {'discount': np.array(time_step.discount, np.float32)} + return obs, reward, done, info + + def reset(self): + time_step = self._env.reset() + obs = dict(time_step.observation) + obs['image'] = self.render() + return obs + + def render(self, *args, **kwargs): + if kwargs.get('mode', 'rgb_array') != 'rgb_array': + raise ValueError("Only render mode 'rgb_array' is supported.") + return self._env.physics.render(*self._size, camera_id=self._camera) + + +class Atari: + + LOCK = threading.Lock() + + def __init__( + self, name, action_repeat=4, size=(84, 84), grayscale=True, noops=30, + life_done=False, sticky_actions=True): + import gym + version = 0 if sticky_actions else 4 + name = ''.join(word.title() for word in name.split('_')) + with self.LOCK: + self._env = gym.make('{}NoFrameskip-v{}'.format(name, version)) + self._action_repeat = action_repeat + self._size = size + self._grayscale = grayscale + self._noops = noops + self._life_done = life_done + self._lives = None + shape = self._env.observation_space.shape[:2] + \ + (() if grayscale else (3,)) + self._buffers = [np.empty(shape, dtype=np.uint8) for _ in range(2)] + self._random = np.random.RandomState(seed=None) + + @property + def observation_space(self): + shape = self._size + (1 if self._grayscale else 3,) + space = gym.spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) + return gym.spaces.Dict({'image': space}) + + @property + def action_space(self): + return self._env.action_space + + def close(self): + return self._env.close() + + def reset(self): + with self.LOCK: + self._env.reset() + noops = self._random.randint(1, self._noops + 1) + for _ in range(noops): + done = self._env.step(0)[2] + if done: + with self.LOCK: + self._env.reset() + self._lives = self._env.ale.lives() + if self._grayscale: + self._env.ale.getScreenGrayscale(self._buffers[0]) + else: + self._env.ale.getScreenRGB2(self._buffers[0]) + self._buffers[1].fill(0) + return self._get_obs() + + def step(self, action): + total_reward = 0.0 + for step in range(self._action_repeat): + _, reward, done, info = self._env.step(action) + total_reward += reward + if self._life_done: + lives = self._env.ale.lives() + done = done or lives < self._lives + self._lives = lives + if done: + break + elif step >= self._action_repeat - 2: + index = step - (self._action_repeat - 2) + if self._grayscale: + self._env.ale.getScreenGrayscale(self._buffers[index]) + else: + self._env.ale.getScreenRGB2(self._buffers[index]) + obs = self._get_obs() + return obs, total_reward, done, info + + def render(self, mode): + return self._env.render(mode) + + def 
_get_obs(self): + if self._action_repeat > 1: + np.maximum(self._buffers[0], + self._buffers[1], out=self._buffers[0]) + image = np.array(Image.fromarray(self._buffers[0]).resize( + self._size, Image.BILINEAR)) + image = np.clip(image, 0, 255).astype(np.uint8) + image = image[:, :, None] if self._grayscale else image + return {'image': image} + + +class Collect: + + def __init__(self, env, callbacks=None, precision=32): + self._env = env + self._callbacks = callbacks or () + self._precision = precision + self._episode = None + + def __getattr__(self, name): + return getattr(self._env, name) + + def step(self, action): + obs, reward, done, info = self._env.step(action) + obs = {k: self._convert(v) for k, v in obs.items()} + transition = obs.copy() + transition['action'] = action + transition['reward'] = reward + transition['discount'] = info.get( + 'discount', np.array(1 - float(done))) + self._episode.append(transition) + if done: + episode = {k: [t[k] for t in self._episode] + for k in self._episode[0]} + episode = {k: self._convert(v) for k, v in episode.items()} + info['episode'] = episode + for callback in self._callbacks: + callback(episode) + return obs, reward, done, info + + def reset(self): + obs = self._env.reset() + transition = obs.copy() + transition['action'] = np.zeros(self._env.action_space.shape) + transition['reward'] = 0.0 + transition['discount'] = 1.0 + self._episode = [transition] + return obs + + def _convert(self, value): + value = np.array(value) + if np.issubdtype(value.dtype, np.floating): + dtype = {16: np.float16, 32: np.float32, + 64: np.float64}[self._precision] + elif np.issubdtype(value.dtype, np.signedinteger): + dtype = {16: np.int16, 32: np.int32, 64: np.int64}[self._precision] + elif np.issubdtype(value.dtype, np.uint8): + dtype = np.uint8 + else: + raise NotImplementedError(value.dtype) + return value.astype(dtype) + + +class TimeLimit: + + def __init__(self, env, duration): + self._env = env + self._duration = duration + self._step = None + + def __getattr__(self, name): + return getattr(self._env, name) + + def step(self, action): + assert self._step is not None, 'Must reset environment.' 
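+        # When the limit below fires, done is forced to True while the
+        # discount defaults to 1.0, so learners can tell a time-out apart
+        # from a true terminal state (where the env reports discount 0).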
+ obs, reward, done, info = self._env.step(action) + self._step += 1 + if self._step >= self._duration: + done = True + if 'discount' not in info: + info['discount'] = np.array(1.0).astype(np.float32) + self._step = None + return obs, reward, done, info + + def reset(self): + self._step = 0 + return self._env.reset() + + +class ActionRepeat: + + def __init__(self, env, amount): + self._env = env + self._amount = amount + + def __getattr__(self, name): + return getattr(self._env, name) + + def step(self, action): + done = False + total_reward = 0 + current_step = 0 + while current_step < self._amount and not done: + obs, reward, done, info = self._env.step(action) + total_reward += reward + current_step += 1 + return obs, total_reward, done, info + + +class NormalizeActions: + + def __init__(self, env): + self._env = env + self._mask = np.logical_and( + np.isfinite(env.action_space.low), + np.isfinite(env.action_space.high)) + self._low = np.where(self._mask, env.action_space.low, -1) + self._high = np.where(self._mask, env.action_space.high, 1) + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def action_space(self): + low = np.where(self._mask, -np.ones_like(self._low), self._low) + high = np.where(self._mask, np.ones_like(self._low), self._high) + return gym.spaces.Box(low, high, dtype=np.float32) + + def step(self, action): + original = (action + 1) / 2 * (self._high - self._low) + self._low + original = np.where(self._mask, original, action) + return self._env.step(original) + + +class ObsDict: + + def __init__(self, env, key='obs'): + self._env = env + self._key = key + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def observation_space(self): + spaces = {self._key: self._env.observation_space} + return gym.spaces.Dict(spaces) + + @property + def action_space(self): + return self._env.action_space + + def step(self, action): + obs, reward, done, info = self._env.step(action) + obs = {self._key: np.array(obs)} + return obs, reward, done, info + + def reset(self): + obs = self._env.reset() + obs = {self._key: np.array(obs)} + return obs + + +class OneHotAction: + + def __init__(self, env): + assert isinstance(env.action_space, gym.spaces.Discrete) + self._env = env + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def action_space(self): + shape = (self._env.action_space.n,) + space = gym.spaces.Box(low=0, high=1, shape=shape, dtype=np.float32) + space.sample = self._sample_action + return space + + def step(self, action): + index = np.argmax(action).astype(int) + reference = np.zeros_like(action) + reference[index] = 1 + if not np.allclose(reference, action): + raise ValueError(f'Invalid one-hot action:\n{action}') + return self._env.step(index) + + def reset(self): + return self._env.reset() + + def _sample_action(self): + actions = self._env.action_space.n + index = self._random.randint(0, actions) + reference = np.zeros(actions, dtype=np.float32) + reference[index] = 1.0 + return reference + + +class RewardObs: + + def __init__(self, env): + self._env = env + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def observation_space(self): + spaces = self._env.observation_space.spaces + assert 'reward' not in spaces + spaces['reward'] = gym.spaces.Box(-np.inf, np.inf, dtype=np.float32) + return gym.spaces.Dict(spaces) + + def step(self, action): + obs, reward, done, info = self._env.step(action) + obs['reward'] = reward + return obs, reward, done, info + + 
def reset(self): + obs = self._env.reset() + obs['reward'] = 0.0 + return obs + + +class Async: + + _ACCESS = 1 + _CALL = 2 + _RESULT = 3 + _EXCEPTION = 4 + _CLOSE = 5 + + def __init__(self, ctor, strategy='process'): + self._strategy = strategy + if strategy == 'none': + self._env = ctor() + elif strategy == 'thread': + import multiprocessing.dummy as mp + elif strategy == 'process': + import multiprocessing as mp + else: + raise NotImplementedError(strategy) + if strategy != 'none': + self._conn, conn = mp.Pipe() + self._process = mp.Process(target=self._worker, args=(ctor, conn)) + atexit.register(self.close) + self._process.start() + self._obs_space = None + self._action_space = None + + @property + def observation_space(self): + if not self._obs_space: + self._obs_space = self.__getattr__('observation_space') + return self._obs_space + + @property + def action_space(self): + if not self._action_space: + self._action_space = self.__getattr__('action_space') + return self._action_space + + def __getattr__(self, name): + if self._strategy == 'none': + return getattr(self._env, name) + self._conn.send((self._ACCESS, name)) + return self._receive() + + def call(self, name, *args, **kwargs): + blocking = kwargs.pop('blocking', True) + if self._strategy == 'none': + return functools.partial(getattr(self._env, name), *args, **kwargs) + payload = name, args, kwargs + self._conn.send((self._CALL, payload)) + promise = self._receive + return promise() if blocking else promise + + def close(self): + if self._strategy == 'none': + try: + self._env.close() + except AttributeError: + pass + return + try: + self._conn.send((self._CLOSE, None)) + self._conn.close() + except IOError: + # The connection was already closed. + pass + self._process.join() + + def step(self, action, blocking=True): + return self.call('step', action, blocking=blocking) + + def reset(self, blocking=True): + return self.call('reset', blocking=blocking) + + def _receive(self): + try: + message, payload = self._conn.recv() + except ConnectionResetError: + raise RuntimeError('Environment worker crashed.') + # Re-raise exceptions in the main process. + if message == self._EXCEPTION: + stacktrace = payload + raise Exception(stacktrace) + if message == self._RESULT: + return payload + raise KeyError(f'Received message of unexpected type {message}') + + def _worker(self, ctor, conn): + try: + env = ctor() + while True: + try: + # Only block for short times to have keyboard exceptions be raised. 
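+                    # poll() returning False only means no message arrived
+                    # within 0.1s; looping back instead of blocking in recv()
+                    # lets EOFError and KeyboardInterrupt stop the worker
+                    # promptly.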
+ if not conn.poll(0.1): + continue + message, payload = conn.recv() + except (EOFError, KeyboardInterrupt): + break + if message == self._ACCESS: + name = payload + result = getattr(env, name) + conn.send((self._RESULT, result)) + continue + if message == self._CALL: + name, args, kwargs = payload + result = getattr(env, name)(*args, **kwargs) + conn.send((self._RESULT, result)) + continue + if message == self._CLOSE: + assert payload is None + break + raise KeyError(f'Received message of unknown type {message}') + except Exception: + stacktrace = ''.join(traceback.format_exception(*sys.exc_info())) + print(f'Error in environment process: {stacktrace}') + conn.send((self._EXCEPTION, stacktrace)) + conn.close() + + +class FrameStack(gym.Wrapper): + def __init__(self, env, k): + gym.Wrapper.__init__(self, env) + self._k = k + self._frames = deque([], maxlen=k) + shp = env.observation_space.shape + self.observation_space = gym.spaces.Box( + low=0, + high=1, + shape=((shp[0] * k,) + shp[1:]), + dtype=env.observation_space.dtype + ) + self._max_episode_steps = env._max_episode_steps + + def reset(self): + obs = self.env.reset() + for _ in range(self._k): + self._frames.append(obs) + return self._get_obs() + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self._frames.append(obs) + return self._get_obs(), reward, done, info + + def _get_obs(self): + assert len(self._frames) == self._k + return np.concatenate(list(self._frames), axis=0) diff --git a/DreamerV2/configs.yaml b/DreamerV2/configs.yaml new file mode 100644 index 0000000..872e3ae --- /dev/null +++ b/DreamerV2/configs.yaml @@ -0,0 +1,185 @@ +defaults: + gpu: 'none' + logdir: ./ + traindir: null + evaldir: null + offline_traindir: '' + offline_evaldir: '' + seed: 0 + steps: 1e7 + eval_every: 1e4 + log_every: 1e4 + reset_every: 0 + gpu_growth: True + precision: 32 + debug: False + expl_gifs: False + + # Environment + task: 'dmc_walker_walk' + size: [64, 64] + envs: 1 + action_repeat: 2 + time_limit: 1000 + prefill: 2500 + eval_noise: 0.0 + clip_rewards: 'identity' + atari_grayscale: False + + # Model + dyn_cell: 'gru' + dyn_hidden: 200 + dyn_deter: 200 + dyn_stoch: 50 + dyn_discrete: 0 + dyn_input_layers: 1 + dyn_output_layers: 1 + dyn_shared: False + dyn_mean_act: 'none' + dyn_std_act: 'sigmoid2' + dyn_min_std: 0.1 + grad_heads: ['image', 'reward'] + units: 400 + reward_layers: 2 + discount_layers: 3 + value_layers: 3 + actor_layers: 4 + act: 'elu' + cnn_depth: 32 + encoder_kernels: [4, 4, 4, 4] + decoder_kernels: [5, 5, 6, 6] + decoder_thin: True + value_head: 'normal' + kl_scale: '1.0' + kl_balance: '0.8' + kl_free: '1.0' + pred_discount: False + discount_scale: 1.0 + reward_scale: 1.0 + weight_decay: 0.0 + + # Training + batch_size: 50 + batch_length: 50 + train_every: 5 + train_steps: 1 + pretrain: 100 + model_lr: 3e-4 + value_lr: 8e-5 + actor_lr: 8e-5 + opt_eps: 1e-5 + grad_clip: 100 + value_grad_clip: 100 + actor_grad_clip: 100 + dataset_size: 0 + oversample_ends: False + slow_value_target: True + slow_actor_target: True + slow_target_update: 100 + slow_target_fraction: 1 + opt: 'adam' + + # Behavior. 
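+  # imag_gradient selects the actor gradient estimator: 'dynamics'
+  # backpropagates returns through the learned model, 'reinforce' uses
+  # score-function gradients, and 'both' blends the two via
+  # imag_gradient_mix. Schedule-valued entries accept a constant such as
+  # '0.1' or an expression such as 'linear(0.1,0,2.5e6)' that is
+  # re-evaluated at the current training step.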
+ discount: 0.99 + discount_lambda: 0.95 + imag_horizon: 15 + imag_gradient: 'dynamics' + imag_gradient_mix: '0.1' + imag_sample: True + actor_dist: 'trunc_normal' + actor_entropy: '1e-4' + actor_state_entropy: 0.0 + actor_init_std: 1.0 + actor_min_std: 0.1 + actor_disc: 5 + actor_temp: 0.1 + actor_outscale: 0.0 + expl_amount: 0.0 + eval_state_mean: False + collect_dyn_sample: True + behavior_stop_grad: True + value_decay: 0.0 + future_entropy: False + + # Exploration + expl_behavior: 'greedy' + expl_until: 0 + expl_extr_scale: 0.0 + expl_intr_scale: 1.0 + disag_target: 'stoch' + disag_log: True + disag_models: 10 + disag_offset: 1 + disag_layers: 4 + disag_units: 400 + +atari: + + # General + task: 'atari_demon_attack' + steps: 3e7 + eval_every: 1e5 + log_every: 1e4 + prefill: 50000 + dataset_size: 2e6 + pretrain: 0 + precision: 16 + + # Environment + time_limit: 108000 # 30 minutes of game play. + atari_grayscale: True + action_repeat: 4 + eval_noise: 0.001 + train_every: 16 + train_steps: 1 + clip_rewards: 'tanh' + + # Model + grad_heads: ['image', 'reward', 'discount'] + dyn_cell: 'gru_layer_norm' + pred_discount: True + cnn_depth: 48 + dyn_deter: 600 + dyn_hidden: 600 + dyn_stoch: 32 + dyn_discrete: 32 + reward_layers: 4 + discount_layers: 4 + value_layers: 4 + actor_layers: 4 + + # Behavior + actor_dist: 'onehot' + actor_entropy: 'linear(3e-3,3e-4,2.5e6)' + expl_amount: 0.0 + expl_until: 3e7 + discount: 0.995 + imag_gradient: 'both' + imag_gradient_mix: 'linear(0.1,0,2.5e6)' + + # Training + discount_scale: 5.0 + reward_scale: 1 + weight_decay: 1e-6 + model_lr: 2e-4 + kl_scale: 0.1 + kl_free: 0.0 + actor_lr: 4e-5 + value_lr: 1e-4 + oversample_ends: True + + # Disen + disen_cnn_depth: 16 + disen_only_scale: 1.0 + disen_discount_scale: 2000.0 + disen_reward_scale: 2000.0 + num_reward_opt_iters: 20 + +debug: + + debug: True + pretrain: 1 + prefill: 1 + train_steps: 1 + batch_size: 10 + batch_length: 20 diff --git a/DreamerV2/dreamer.py b/DreamerV2/dreamer.py new file mode 100644 index 0000000..6883bfc --- /dev/null +++ b/DreamerV2/dreamer.py @@ -0,0 +1,316 @@ +import argparse +import collections +import functools +import os +import pathlib +import sys +import warnings + +warnings.filterwarnings('ignore', '.*box bound precision lowered.*') +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['MUJOCO_GL'] = 'egl' + +import numpy as np +import ruamel.yaml as yaml +import tensorflow as tf +from tensorflow.keras.mixed_precision import experimental as prec + +tf.get_logger().setLevel('ERROR') + +from tensorflow_probability import distributions as tfd + +sys.path.append(str(pathlib.Path(__file__).parent)) + +import exploration as expl +import models +import tools +import wrappers + +class Dreamer(tools.Module): + + def __init__(self, config, logger, dataset): + self._config = config + self._logger = logger + self._float = prec.global_policy().compute_dtype + self._should_log = tools.Every(config.log_every) + self._should_train = tools.Every(config.train_every) + self._should_pretrain = tools.Once() + self._should_reset = tools.Every(config.reset_every) + self._should_expl = tools.Until(int( + config.expl_until / config.action_repeat)) + self._metrics = collections.defaultdict(tf.metrics.Mean) + with tf.device('cpu:0'): + self._step = tf.Variable(count_steps(config.traindir), dtype=tf.int64) + # Schedules. 
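+        # The default-argument trick (x=config.actor_entropy) captures the
+        # original schedule string before the attribute is rebound; each
+        # entry becomes a zero-argument closure that tools.schedule
+        # re-evaluates at the current value of self._step on every call.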
+ config.actor_entropy = ( + lambda x=config.actor_entropy: tools.schedule(x, self._step)) + config.actor_state_entropy = ( + lambda x=config.actor_state_entropy: tools.schedule(x, self._step)) + config.imag_gradient_mix = ( + lambda x=config.imag_gradient_mix: tools.schedule(x, self._step)) + self._dataset = iter(dataset) + self._wm = models.WorldModel(self._step, config) + self._task_behavior = models.ImagBehavior( + config, self._wm, config.behavior_stop_grad) + reward = lambda f, s, a: self._wm.heads['reward'](f).mode() + self._expl_behavior = dict( + greedy=lambda: self._task_behavior, + random=lambda: expl.Random(config), + plan2explore=lambda: expl.Plan2Explore(config, self._wm, reward), + )[config.expl_behavior]() + # Train step to initialize variables including optimizer statistics. + self._train(next(self._dataset)) + + def __call__(self, obs, reset, state=None, training=True): + step = self._step.numpy().item() + if self._should_reset(step): + state = None + if state is not None and reset.any(): + mask = tf.cast(1 - reset, self._float)[:, None] + state = tf.nest.map_structure(lambda x: x * mask, state) + if training and self._should_train(step): + steps = ( + self._config.pretrain if self._should_pretrain() + else self._config.train_steps) + for _ in range(steps): + self._train(next(self._dataset)) + if self._should_log(step): + for name, mean in self._metrics.items(): + self._logger.scalar(name, float(mean.result())) + mean.reset_states() + openl_joint, openl_main, openl_disen, openl_mask = self._wm.video_pred(next(self._dataset)) + self._logger.video('train_openl_joint', openl_joint) + self._logger.video('train_openl_main', openl_main) + self._logger.video('train_openl_disen', openl_disen) + self._logger.video('train_openl_mask', openl_mask) + self._logger.write(fps=True) + action, state = self._policy(obs, state, training) + if training: + self._step.assign_add(len(reset)) + self._logger.step = self._config.action_repeat \ + * self._step.numpy().item() + return action, state + + @tf.function + def _policy(self, obs, state, training): + if state is None: + batch_size = len(obs['image']) + latent = self._wm.dynamics.initial(len(obs['image'])) + action = tf.zeros((batch_size, self._config.num_actions), self._float) + else: + latent, action = state + embed = self._wm.encoder(self._wm.preprocess(obs)) + latent, _ = self._wm.dynamics.obs_step( + latent, action, embed, self._config.collect_dyn_sample) + if self._config.eval_state_mean: + latent['stoch'] = latent['mean'] + feat = self._wm.dynamics.get_feat(latent) + if not training: + action = self._task_behavior.actor(feat).mode() + elif self._should_expl(self._step): + action = self._expl_behavior.actor(feat).sample() + else: + action = self._task_behavior.actor(feat).sample() + if self._config.actor_dist == 'onehot_gumble': + action = tf.cast( + tf.one_hot(tf.argmax(action, axis=-1), self._config.num_actions), + action.dtype) + action = self._exploration(action, training) + state = (latent, action) + return action, state + + def _exploration(self, action, training): + amount = self._config.expl_amount if training else self._config.eval_noise + if amount == 0: + return action + amount = tf.cast(amount, self._float) + if 'onehot' in self._config.actor_dist: + probs = amount / self._config.num_actions + (1 - amount) * action + return tools.OneHotDist(probs=probs).sample() + else: + return tf.clip_by_value(tfd.Normal(action, amount).sample(), -1, 1) + raise NotImplementedError(self._config.action_noise) + + @tf.function + def 
_train(self, data): + print('Tracing train function.') + metrics = {} + embed, post, feat, kl, mets = self._wm.train(data) + metrics.update(mets) + start = post + if self._config.pred_discount: # Last step could be terminal. + start = {k: v[:, :-1] for k, v in post.items()} + embed, feat, kl = embed[:, :-1], feat[:, :-1], kl[:, :-1] + reward = lambda f, s, a: self._wm.heads['reward'](f).mode() + metrics.update(self._task_behavior.train(start, reward)[-1]) + if self._config.expl_behavior != 'greedy': + mets = self._expl_behavior.train(start, feat, embed, kl)[-1] + metrics.update({'expl_' + key: value for key, value in mets.items()}) + for name, value in metrics.items(): + self._metrics[name].update_state(value) + + +def count_steps(folder): + return sum(int(str(n).split('-')[-1][:-4]) - 1 for n in folder.glob('*.npz')) + + +def make_dataset(episodes, config): + example = episodes[next(iter(episodes.keys()))] + types = {k: v.dtype for k, v in example.items()} + shapes = {k: (None,) + v.shape[1:] for k, v in example.items()} + generator = lambda: tools.sample_episodes( + episodes, config.batch_length, config.oversample_ends) + dataset = tf.data.Dataset.from_generator(generator, types, shapes) + dataset = dataset.batch(config.batch_size, drop_remainder=True) + dataset = dataset.prefetch(10) + return dataset + + +def make_env(config, logger, mode, train_eps, eval_eps): + suite, task = config.task.split('_', 1) + if suite == 'dmc': + env = wrappers.DeepMindControl(task, config.action_repeat, config.size) + env = wrappers.NormalizeActions(env) + elif suite == 'atari': + env = wrappers.Atari( + task, config.action_repeat, config.size, + grayscale=config.atari_grayscale, + life_done=False and (mode == 'train'), + sticky_actions=True, + all_actions=True) + env = wrappers.OneHotAction(env) + else: + raise NotImplementedError(suite) + env = wrappers.TimeLimit(env, config.time_limit) + callbacks = [functools.partial( + process_episode, config, logger, mode, train_eps, eval_eps)] + env = wrappers.CollectDataset(env, callbacks) + env = wrappers.RewardObs(env) + return env + + +def process_episode(config, logger, mode, train_eps, eval_eps, episode): + directory = dict(train=config.traindir, eval=config.evaldir)[mode] + cache = dict(train=train_eps, eval=eval_eps)[mode] + filename = tools.save_episodes(directory, [episode])[0] + length = len(episode['reward']) - 1 + score = float(episode['reward'].astype(np.float64).sum()) + video = episode['image'] + if mode == 'eval': + cache.clear() + if mode == 'train' and config.dataset_size: + total = 0 + for key, ep in reversed(sorted(cache.items(), key=lambda x: x[0])): + if total <= config.dataset_size - length: + total += len(ep['reward']) - 1 + else: + del cache[key] + logger.scalar('dataset_size', total + length) + cache[str(filename)] = episode + print(f'{mode.title()} episode has {length} steps and return {score:.1f}.') + logger.scalar(f'{mode}_return', score) + logger.scalar(f'{mode}_length', length) + logger.scalar(f'{mode}_episodes', len(cache)) + if mode == 'eval' or config.expl_gifs: + logger.video(f'{mode}_policy', video[None]) + logger.write() + + +def main(logdir, config): + + logdir = os.path.join( + logdir, config.task, 'Ours', str(config.seed)) + + logdir = pathlib.Path(logdir).expanduser() + config.traindir = config.traindir or logdir / 'train_eps' + config.evaldir = config.evaldir or logdir / 'eval_eps' + config.steps //= config.action_repeat + config.eval_every //= config.action_repeat + config.log_every //= config.action_repeat + 
config.time_limit //= config.action_repeat + config.act = getattr(tf.nn, config.act) + + if config.debug: + tf.config.experimental_run_functions_eagerly(True) + if config.gpu_growth: + message = 'No GPU found. To actually train on CPU remove this assert.' + assert tf.config.experimental.list_physical_devices('GPU'), message + for gpu in tf.config.experimental.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(gpu, True) + assert config.precision in (16, 32), config.precision + if config.precision == 16: + prec.set_policy(prec.Policy('mixed_float16')) + print('Logdir', logdir) + logdir.mkdir(parents=True, exist_ok=True) + config.traindir.mkdir(parents=True, exist_ok=True) + config.evaldir.mkdir(parents=True, exist_ok=True) + step = count_steps(config.traindir) + logger = tools.Logger(logdir, config.action_repeat * step) + + print('Create envs.') + if config.offline_traindir: + directory = config.offline_traindir.format(**vars(config)) + else: + directory = config.traindir + train_eps = tools.load_episodes(directory, limit=config.dataset_size) + if config.offline_evaldir: + directory = config.offline_evaldir.format(**vars(config)) + else: + directory = config.evaldir + eval_eps = tools.load_episodes(directory, limit=1) + make = lambda mode: make_env(config, logger, mode, train_eps, eval_eps) + train_envs = [make('train') for _ in range(config.envs)] + eval_envs = [make('eval') for _ in range(config.envs)] + acts = train_envs[0].action_space + config.num_actions = acts.n if hasattr(acts, 'n') else acts.shape[0] + + prefill = max(0, config.prefill - count_steps(config.traindir)) + print(f'Prefill dataset ({prefill} steps).') + random_agent = lambda o, d, s: ([acts.sample() for _ in d], s) + tools.simulate(random_agent, train_envs, prefill) + tools.simulate(random_agent, eval_envs, episodes=1) + logger.step = config.action_repeat * count_steps(config.traindir) + + print('Simulate agent.') + train_dataset = make_dataset(train_eps, config) + eval_dataset = iter(make_dataset(eval_eps, config)) + agent = Dreamer(config, logger, train_dataset) + if (logdir / 'variables.pkl').exists(): + agent.load(logdir / 'variables.pkl') + agent._should_pretrain._once = False + + state = None + suite, task = config.task.split('_', 1) + num_eval_episodes = 10 if suite == 'procgen' else 1 + while agent._step.numpy().item() < config.steps: + logger.write() + print('Start evaluation.') + openl_joint, openl_main, openl_disen, openl_mask = agent._wm.video_pred(next(eval_dataset)) + logger.video('eval_openl_joint', openl_joint) + logger.video('eval_openl_main', openl_main) + logger.video('eval_openl_disen', openl_disen) + logger.video('eval_openl_mask', openl_mask) + eval_policy = functools.partial(agent, training=False) + tools.simulate(eval_policy, eval_envs, episodes=num_eval_episodes) + print('Start training.') + state = tools.simulate(agent, train_envs, config.eval_every, state=state) + agent.save(logdir / 'variables.pkl') + for env in train_envs + eval_envs: + try: + env.close() + except Exception: + pass + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--configs', nargs='+', required=True) + args, remaining = parser.parse_known_args() + configs = yaml.safe_load((pathlib.Path(__file__).parent / 'configs.yaml').read_text()) + config_ = {} + for name in args.configs: + config_.update(configs[name]) + parser = argparse.ArgumentParser() + for key, value in config_.items(): + arg_type = tools.args_type(value) + parser.add_argument(f'--{key}', type=arg_type, 
default=arg_type(value))
+    main(config_['logdir'], parser.parse_args(remaining))
diff --git a/DreamerV2/exploration.py b/DreamerV2/exploration.py
new file mode 100644
index 0000000..4425ba5
--- /dev/null
+++ b/DreamerV2/exploration.py
@@ -0,0 +1,83 @@
+import tensorflow as tf
+from tensorflow.keras.mixed_precision import experimental as prec
+from tensorflow_probability import distributions as tfd
+
+import models
+import networks
+import tools
+
+class Random(tools.Module):
+
+    def __init__(self, config):
+        self._config = config
+        self._float = prec.global_policy().compute_dtype
+
+    def actor(self, feat):
+        shape = feat.shape[:-1] + [self._config.num_actions]
+        if self._config.actor_dist == 'onehot':
+            return tools.OneHotDist(tf.zeros(shape))
+        else:
+            ones = tf.ones(shape, self._float)
+            return tfd.Uniform(-ones, ones)
+
+    def train(self, start, feat, embed, kl):
+        return None, {}
+
+
+class Plan2Explore(tools.Module):
+
+    def __init__(self, config, world_model, reward=None):
+        self._config = config
+        self._reward = reward
+        self._behavior = models.ImagBehavior(config, world_model)
+        self.actor = self._behavior.actor
+        size = {
+            'embed': 32 * config.cnn_depth,
+            'stoch': config.dyn_stoch,
+            'deter': config.dyn_deter,
+            'feat': config.dyn_stoch + config.dyn_deter,
+        }[self._config.disag_target]
+        kw = dict(
+            shape=size, layers=config.disag_layers, units=config.disag_units,
+            act=config.act)
+        self._networks = [
+            networks.DenseHead(**kw) for _ in range(config.disag_models)]
+        self._opt = tools.Optimizer(
+            'ensemble', config.model_lr, config.opt_eps, config.grad_clip,
+            config.weight_decay, opt=config.opt)
+
+    def train(self, start, feat, embed, kl):
+        metrics = {}
+        target = {
+            'embed': embed,
+            'stoch': start['stoch'],
+            'deter': start['deter'],
+            'feat': feat,
+        }[self._config.disag_target]
+        metrics.update(self._train_ensemble(feat, target))
+        metrics.update(self._behavior.train(start, self._intrinsic_reward)[-1])
+        return None, metrics
+
+    def _intrinsic_reward(self, feat, state, action):
+        preds = [head(feat, tf.float32).mean() for head in self._networks]
+        disag = tf.reduce_mean(tf.math.reduce_std(preds, 0), -1)
+        if self._config.disag_log:
+            disag = tf.math.log(disag)
+        reward = self._config.expl_intr_scale * disag
+        if self._config.expl_extr_scale:
+            reward += tf.cast(self._config.expl_extr_scale * self._reward(
+                feat, state, action), tf.float32)
+        return reward
+
+    def _train_ensemble(self, inputs, targets):
+        if self._config.disag_offset:
+            targets = targets[:, self._config.disag_offset:]
+            inputs = inputs[:, :-self._config.disag_offset]
+        targets = tf.stop_gradient(targets)
+        inputs = tf.stop_gradient(inputs)
+        with tf.GradientTape() as tape:
+            preds = [head(inputs) for head in self._networks]
+            likes = [tf.reduce_mean(pred.log_prob(targets)) for pred in preds]
+            loss = -tf.cast(tf.reduce_sum(likes), tf.float32)
+        metrics = self._opt(tape, loss, self._networks)
+        return metrics
diff --git a/DreamerV2/models.py b/DreamerV2/models.py
new file mode 100644
index 0000000..8225adb
--- /dev/null
+++ b/DreamerV2/models.py
@@ -0,0 +1,429 @@
+import tensorflow as tf
+from tensorflow.keras.mixed_precision import experimental as prec
+
+import networks
+import tools
+
+
+class WorldModel(tools.Module):
+
+    def __init__(self, step, config):
+        self._step = step
+        self._config = config
+        channels = (1 if config.atari_grayscale else 3)
+        shape = config.size + (channels,)
+
+        ########
+        # Main #
+        ########
+        self.encoder = networks.ConvEncoder(
+            config.cnn_depth, config.act,
config.encoder_kernels) + self.dynamics = networks.RSSM( + config.dyn_stoch, config.dyn_deter, config.dyn_hidden, + config.dyn_input_layers, config.dyn_output_layers, config.dyn_shared, + config.dyn_discrete, config.act, config.dyn_mean_act, + config.dyn_std_act, config.dyn_min_std, config.dyn_cell) + self.heads = {} + self.heads['reward'] = networks.DenseHead( + [], config.reward_layers, config.units, config.act) + if config.pred_discount: + self.heads['discount'] = networks.DenseHead( + [], config.discount_layers, config.units, config.act, dist='binary') + self._model_opt = tools.Optimizer( + 'model', config.model_lr, config.opt_eps, config.grad_clip, + config.weight_decay, opt=config.opt) + self._scales = dict( + reward=config.reward_scale, discount=config.discount_scale) + + ######### + # Disen # + ######### + self.disen_encoder = networks.ConvEncoder( + config.disen_cnn_depth, config.act, config.encoder_kernels) + self.disen_dynamics = networks.RSSM( + config.dyn_stoch, config.dyn_deter, config.dyn_hidden, + config.dyn_input_layers, config.dyn_output_layers, config.dyn_shared, + config.dyn_discrete, config.act, config.dyn_mean_act, + config.dyn_std_act, config.dyn_min_std, config.dyn_cell) + + self.disen_heads = {} + self.disen_heads['reward'] = networks.DenseHead( + [], config.reward_layers, config.units, config.act) + if config.pred_discount: + self.disen_heads['discount'] = networks.DenseHead( + [], config.discount_layers, config.units, config.act, dist='binary') + + self._disen_model_opt = tools.Optimizer( + 'disen', config.model_lr, config.opt_eps, config.grad_clip, + config.weight_decay, opt=config.opt) + + self._disen_heads_opt = {} + self._disen_heads_opt['reward'] = tools.Optimizer( + 'disen_reward', config.model_lr, config.opt_eps, config.grad_clip, + config.weight_decay, opt=config.opt) + if config.pred_discount: + self._disen_heads_opt['discount'] = tools.Optimizer( + 'disen_pcont', config.model_lr, config.opt_eps, config.grad_clip, + config.weight_decay, opt=config.opt) + + # negative signs for reward/discount here + self._disen_scales = dict(disen_only=config.disen_only_scale, + reward=-config.disen_reward_scale, discount=-config.disen_discount_scale) + + self.disen_only_image_head = networks.ConvDecoder( + config.disen_cnn_depth, config.act, shape, config.decoder_kernels, + config.decoder_thin) + + ################ + # Joint Decode # + ################ + self.image_head = networks.ConvDecoderMask( + config.cnn_depth, config.act, shape, config.decoder_kernels, + config.decoder_thin) + self.disen_image_head = networks.ConvDecoderMask( + config.disen_cnn_depth, config.act, shape, config.decoder_kernels, + config.decoder_thin) + self.joint_image_head = networks.ConvDecoderMaskEnsemble( + self.image_head, self.disen_image_head + ) + + def train(self, data): + data = self.preprocess(data) + with tf.GradientTape() as model_tape, tf.GradientTape() as disen_tape: + + # kl schedule + kl_balance = tools.schedule(self._config.kl_balance, self._step) + kl_free = tools.schedule(self._config.kl_free, self._step) + kl_scale = tools.schedule(self._config.kl_scale, self._step) + + # Main + embed = self.encoder(data) + post, prior = self.dynamics.observe(embed, data['action']) + kl_loss, kl_value = self.dynamics.kl_loss( + post, prior, kl_balance, kl_free, kl_scale) + feat = self.dynamics.get_feat(post) + likes = {} + for name, head in self.heads.items(): + grad_head = (name in self._config.grad_heads) + inp = feat if grad_head else tf.stop_gradient(feat) + pred = head(inp, tf.float32) + 
like = pred.log_prob(tf.cast(data[name], tf.float32)) + likes[name] = tf.reduce_mean( + like) * self._scales.get(name, 1.0) + + # Disen + embed_disen = self.disen_encoder(data) + post_disen, prior_disen = self.disen_dynamics.observe( + embed_disen, data['action']) + kl_loss_disen, kl_value_disen = self.dynamics.kl_loss( + post_disen, prior_disen, kl_balance, kl_free, kl_scale) + feat_disen = self.disen_dynamics.get_feat(post_disen) + + # Optimize disen reward/pcont till optimal + disen_metrics = dict(reward={}, discount={}) + loss_disen = dict(reward=None, discount=None) + for _ in range(self._config.num_reward_opt_iters): + with tf.GradientTape() as disen_reward_tape, tf.GradientTape() as disen_pcont_tape: + disen_gradient_tapes = dict( + reward=disen_reward_tape, discount=disen_pcont_tape) + for name, head in self.disen_heads.items(): + pred_disen = head( + tf.stop_gradient(feat_disen), tf.float32) + loss_disen[name] = -tf.reduce_mean(pred_disen.log_prob( + tf.cast(data[name], tf.float32))) + for name, head in self.disen_heads.items(): + disen_metrics[name] = self._disen_heads_opt[name]( + disen_gradient_tapes[name], loss_disen[name], [head], prefix='disen_neg') + + # Compute likes for disen model (including negative gradients) + likes_disen = {} + for name, head in self.disen_heads.items(): + pred_disen = head(feat_disen, tf.float32) + like_disen = pred_disen.log_prob( + tf.cast(data[name], tf.float32)) + likes_disen[name] = tf.reduce_mean( + like_disen) * self._disen_scales.get(name, -1.0) + disen_only_image_pred = self.disen_only_image_head( + feat_disen, tf.float32) + disen_only_image_like = tf.reduce_mean(disen_only_image_pred.log_prob( + tf.cast(data['image'], tf.float32))) * self._disen_scales.get('disen_only', 1.0) + likes_disen['disen_only'] = disen_only_image_like + + # Joint decode + image_pred_joint, _, _, _ = self.joint_image_head( + feat, feat_disen, tf.float32) + image_like = tf.reduce_mean(image_pred_joint.log_prob( + tf.cast(data['image'], tf.float32))) + likes['image'] = image_like + likes_disen['image'] = image_like + + # Compute loss + model_loss = kl_loss - sum(likes.values()) + disen_loss = kl_loss_disen - sum(likes_disen.values()) + + model_parts = [self.encoder, self.dynamics, + self.joint_image_head] + list(self.heads.values()) + disen_parts = [self.disen_encoder, self.disen_dynamics, + self.joint_image_head, self.disen_only_image_head] + + metrics = self._model_opt( + model_tape, model_loss, model_parts, prefix='main') + disen_model_metrics = self._disen_model_opt( + disen_tape, disen_loss, disen_parts, prefix='disen') + + metrics['kl_balance'] = kl_balance + metrics['kl_free'] = kl_free + metrics['kl_scale'] = kl_scale + metrics.update({f'{name}_loss': -like for name, + like in likes.items()}) + + metrics['disen/disen_only_image_loss'] = -disen_only_image_like + metrics['disen/disen_reward_loss'] = -likes_disen['reward'] / \ + self._disen_scales.get('reward', -1.0) + metrics['disen/disen_discount_loss'] = -likes_disen['discount'] / \ + self._disen_scales.get('discount', -1.0) + + metrics['kl'] = tf.reduce_mean(kl_value) + metrics['prior_ent'] = self.dynamics.get_dist(prior).entropy() + metrics['post_ent'] = self.dynamics.get_dist(post).entropy() + metrics['disen/kl'] = tf.reduce_mean(kl_value_disen) + metrics['disen/prior_ent'] = self.dynamics.get_dist( + prior_disen).entropy() + metrics['disen/post_ent'] = self.dynamics.get_dist( + post_disen).entropy() + + metrics.update( + {f'{key}': value for key, value in disen_metrics['reward'].items()}) + metrics.update( 
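+            # Fold the discount-head statistics from the adversarial
+            # optimization loop above into the shared metrics dict as well.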
+ {f'{key}': value for key, value in disen_metrics['discount'].items()}) + metrics.update( + {f'{key}': value for key, value in disen_model_metrics.items()}) + + return embed, post, feat, kl_value, metrics + + @tf.function + def preprocess(self, obs): + dtype = prec.global_policy().compute_dtype + obs = obs.copy() + obs['image'] = tf.cast(obs['image'], dtype) / 255.0 - 0.5 + obs['reward'] = getattr(tf, self._config.clip_rewards)(obs['reward']) + if 'discount' in obs: + obs['discount'] *= self._config.discount + for key, value in obs.items(): + if tf.dtypes.as_dtype(value.dtype) in ( + tf.float16, tf.float32, tf.float64): + obs[key] = tf.cast(value, dtype) + return obs + + @tf.function + def video_pred(self, data): + data = self.preprocess(data) + truth = data['image'][:6] + 0.5 + + embed = self.encoder(data) + embed_disen = self.disen_encoder(data) + states, _ = self.dynamics.observe( + embed[:6, :5], data['action'][:6, :5]) + states_disen, _ = self.disen_dynamics.observe( + embed_disen[:6, :5], data['action'][:6, :5]) + feats = self.dynamics.get_feat(states) + feats_disen = self.disen_dynamics.get_feat(states_disen) + recon_joint, recon_main, recon_disen, recon_mask = self.joint_image_head( + feats, feats_disen) + recon_joint = recon_joint.mode()[:6] + recon_main = recon_main.mode()[:6] + recon_disen = recon_disen.mode()[:6] + recon_mask = recon_mask[:6] + + init = {k: v[:, -1] for k, v in states.items()} + init_disen = {k: v[:, -1] for k, v in states_disen.items()} + prior = self.dynamics.imagine( + data['action'][:6, 5:], init) + prior_disen = self.disen_dynamics.imagine( + data['action'][:6, 5:], init_disen) + _feats = self.dynamics.get_feat(prior) + _feats_disen = self.disen_dynamics.get_feat(prior_disen) + openl_joint, openl_main, openl_disen, openl_mask = self.joint_image_head( + _feats, _feats_disen) + openl_joint = openl_joint.mode() + openl_main = openl_main.mode() + openl_disen = openl_disen.mode() + + model_joint = tf.concat( + [recon_joint[:, :5] + 0.5, openl_joint + 0.5], 1) + error_joint = (model_joint - truth + 1) / 2 + model_main = tf.concat( + [recon_main[:, :5] + 0.5, openl_main + 0.5], 1) + error_main = (model_main - truth + 1) / 2 + model_disen = tf.concat( + [recon_disen[:, :5] + 0.5, openl_disen + 0.5], 1) + error_disen = (model_disen - truth + 1) / 2 + model_mask = tf.concat( + [recon_mask[:, :5] + 0.5, openl_mask + 0.5], 1) + + output_joint = tf.concat([truth, model_joint, error_joint], 2) + output_main = tf.concat([truth, model_main, error_main], 2) + output_disen = tf.concat([truth, model_disen, error_disen], 2) + output_mask = model_mask + + return output_joint, output_main, output_disen, output_mask + + +class ImagBehavior(tools.Module): + + def __init__(self, config, world_model, stop_grad_actor=True, reward=None): + self._config = config + self._world_model = world_model + self._stop_grad_actor = stop_grad_actor + self._reward = reward + self.actor = networks.ActionHead( + config.num_actions, config.actor_layers, config.units, config.act, + config.actor_dist, config.actor_init_std, config.actor_min_std, + config.actor_dist, config.actor_temp, config.actor_outscale) + self.value = networks.DenseHead( + [], config.value_layers, config.units, config.act, + config.value_head) + if config.slow_value_target or config.slow_actor_target: + self._slow_value = networks.DenseHead( + [], config.value_layers, config.units, config.act) + self._updates = tf.Variable(0, tf.int64) + kw = dict(wd=config.weight_decay, opt=config.opt) + self._actor_opt = tools.Optimizer( + 
'actor', config.actor_lr, config.opt_eps, config.actor_grad_clip, **kw) + self._value_opt = tools.Optimizer( + 'value', config.value_lr, config.opt_eps, config.value_grad_clip, **kw) + + def train( + self, start, objective=None, imagine=None, tape=None, repeats=None): + objective = objective or self._reward + self._update_slow_target() + metrics = {} + with (tape or tf.GradientTape()) as actor_tape: + assert bool(objective) != bool(imagine) + if objective: + imag_feat, imag_state, imag_action = self._imagine( + start, self.actor, self._config.imag_horizon, repeats) + reward = objective(imag_feat, imag_state, imag_action) + else: + imag_feat, imag_state, imag_action, reward = imagine(start) + actor_ent = self.actor(imag_feat, tf.float32).entropy() + state_ent = self._world_model.dynamics.get_dist( + imag_state, tf.float32).entropy() + target, weights = self._compute_target( + imag_feat, reward, actor_ent, state_ent, + self._config.slow_actor_target) + actor_loss, mets = self._compute_actor_loss( + imag_feat, imag_state, imag_action, target, actor_ent, state_ent, + weights) + metrics.update(mets) + if self._config.slow_value_target != self._config.slow_actor_target: + target, weights = self._compute_target( + imag_feat, reward, actor_ent, state_ent, + self._config.slow_value_target) + with tf.GradientTape() as value_tape: + value = self.value(imag_feat, tf.float32)[:-1] + value_loss = -value.log_prob(tf.stop_gradient(target)) + if self._config.value_decay: + value_loss += self._config.value_decay * value.mode() + value_loss = tf.reduce_mean(weights[:-1] * value_loss) + metrics['reward_mean'] = tf.reduce_mean(reward) + metrics['reward_std'] = tf.math.reduce_std(reward) + metrics['actor_ent'] = tf.reduce_mean(actor_ent) + metrics.update(self._actor_opt(actor_tape, actor_loss, [self.actor])) + metrics.update(self._value_opt(value_tape, value_loss, [self.value])) + return imag_feat, imag_state, imag_action, weights, metrics + + def _imagine(self, start, policy, horizon, repeats=None): + dynamics = self._world_model.dynamics + if repeats: + start = {k: tf.repeat(v, repeats, axis=1) + for k, v in start.items()} + + def flatten(x): return tf.reshape(x, [-1] + list(x.shape[2:])) + start = {k: flatten(v) for k, v in start.items()} + + def step(prev, _): + state, _, _ = prev + feat = dynamics.get_feat(state) + inp = tf.stop_gradient(feat) if self._stop_grad_actor else feat + action = policy(inp).sample() + succ = dynamics.img_step( + state, action, sample=self._config.imag_sample) + return succ, feat, action + feat = 0 * dynamics.get_feat(start) + action = policy(feat).mode() + succ, feats, actions = tools.static_scan( + step, tf.range(horizon), (start, feat, action)) + states = {k: tf.concat([ + start[k][None], v[:-1]], 0) for k, v in succ.items()} + if repeats: + def unfold(tensor): + s = tensor.shape + return tf.reshape(tensor, [s[0], s[1] // repeats, repeats] + s[2:]) + states, feats, actions = tf.nest.map_structure( + unfold, (states, feats, actions)) + return feats, states, actions + + def _compute_target(self, imag_feat, reward, actor_ent, state_ent, slow): + reward = tf.cast(reward, tf.float32) + if 'discount' in self._world_model.heads: + discount = self._world_model.heads['discount']( + imag_feat, tf.float32).mean() + else: + discount = self._config.discount * tf.ones_like(reward) + if self._config.future_entropy and tf.greater( + self._config.actor_entropy(), 0): + reward += self._config.actor_entropy() * actor_ent + if self._config.future_entropy and tf.greater( + 
self._config.actor_state_entropy(), 0): + reward += self._config.actor_state_entropy() * state_ent + if slow: + value = self._slow_value(imag_feat, tf.float32).mode() + else: + value = self.value(imag_feat, tf.float32).mode() + target = tools.lambda_return( + reward[:-1], value[:-1], discount[:-1], + bootstrap=value[-1], lambda_=self._config.discount_lambda, axis=0) + weights = tf.stop_gradient(tf.math.cumprod(tf.concat( + [tf.ones_like(discount[:1]), discount[:-1]], 0), 0)) + return target, weights + + def _compute_actor_loss( + self, imag_feat, imag_state, imag_action, target, actor_ent, state_ent, + weights): + metrics = {} + inp = tf.stop_gradient( + imag_feat) if self._stop_grad_actor else imag_feat + policy = self.actor(inp, tf.float32) + actor_ent = policy.entropy() + if self._config.imag_gradient == 'dynamics': + actor_target = target + elif self._config.imag_gradient == 'reinforce': + imag_action = tf.cast(imag_action, tf.float32) + actor_target = policy.log_prob(imag_action)[:-1] * tf.stop_gradient( + target - self.value(imag_feat[:-1], tf.float32).mode()) + elif self._config.imag_gradient == 'both': + imag_action = tf.cast(imag_action, tf.float32) + actor_target = policy.log_prob(imag_action)[:-1] * tf.stop_gradient( + target - self.value(imag_feat[:-1], tf.float32).mode()) + mix = self._config.imag_gradient_mix() + actor_target = mix * target + (1 - mix) * actor_target + metrics['imag_gradient_mix'] = mix + else: + raise NotImplementedError(self._config.imag_gradient) + if not self._config.future_entropy and tf.greater( + self._config.actor_entropy(), 0): + actor_target += self._config.actor_entropy() * actor_ent[:-1] + if not self._config.future_entropy and tf.greater( + self._config.actor_state_entropy(), 0): + actor_target += self._config.actor_state_entropy() * state_ent[:-1] + actor_loss = -tf.reduce_mean(weights[:-1] * actor_target) + return actor_loss, metrics + + def _update_slow_target(self): + if self._config.slow_value_target or self._config.slow_actor_target: + if self._updates % self._config.slow_target_update == 0: + mix = self._config.slow_target_fraction + for s, d in zip(self.value.variables, self._slow_value.variables): + d.assign(mix * s + (1 - mix) * d) + self._updates.assign_add(1) diff --git a/DreamerV2/networks.py b/DreamerV2/networks.py new file mode 100644 index 0000000..7a65d15 --- /dev/null +++ b/DreamerV2/networks.py @@ -0,0 +1,465 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras import layers as tfkl +from tensorflow_probability import distributions as tfd +from tensorflow.keras.mixed_precision import experimental as prec + +import tools + +class RSSM(tools.Module): + + def __init__( + self, stoch=30, deter=200, hidden=200, layers_input=1, layers_output=1, + shared=False, discrete=False, act=tf.nn.elu, mean_act='none', + std_act='softplus', min_std=0.1, cell='keras'): + super().__init__() + self._stoch = stoch + self._deter = deter + self._hidden = hidden + self._min_std = min_std + self._layers_input = layers_input + self._layers_output = layers_output + self._shared = shared + self._discrete = discrete + self._act = act + self._mean_act = mean_act + self._std_act = std_act + self._embed = None + if cell == 'gru': + self._cell = tfkl.GRUCell(self._deter) + elif cell == 'gru_layer_norm': + self._cell = GRUCell(self._deter, norm=True) + else: + raise NotImplementedError(cell) + + def initial(self, batch_size): + dtype = prec.global_policy().compute_dtype + if self._discrete: + state = dict( + logit=tf.zeros( + [batch_size, 
self._stoch, self._discrete], dtype), + stoch=tf.zeros( + [batch_size, self._stoch, self._discrete], dtype), + deter=self._cell.get_initial_state(None, batch_size, dtype)) + else: + state = dict( + mean=tf.zeros([batch_size, self._stoch], dtype), + std=tf.zeros([batch_size, self._stoch], dtype), + stoch=tf.zeros([batch_size, self._stoch], dtype), + deter=self._cell.get_initial_state(None, batch_size, dtype)) + return state + + @tf.function + def observe(self, embed, action, state=None): + def swap(x): return tf.transpose( + x, [1, 0] + list(range(2, len(x.shape)))) + if state is None: + state = self.initial(tf.shape(action)[0]) + embed, action = swap(embed), swap(action) + post, prior = tools.static_scan( + lambda prev, inputs: self.obs_step(prev[0], *inputs), + (action, embed), (state, state)) + post = {k: swap(v) for k, v in post.items()} + prior = {k: swap(v) for k, v in prior.items()} + return post, prior + + @tf.function + def imagine(self, action, state=None): + def swap(x): return tf.transpose( + x, [1, 0] + list(range(2, len(x.shape)))) + if state is None: + state = self.initial(tf.shape(action)[0]) + assert isinstance(state, dict), state + action = swap(action) + prior = tools.static_scan(self.img_step, action, state) + prior = {k: swap(v) for k, v in prior.items()} + return prior + + def get_feat(self, state): + stoch = state['stoch'] + if self._discrete: + shape = stoch.shape[:-2] + [self._stoch * self._discrete] + stoch = tf.reshape(stoch, shape) + return tf.concat([stoch, state['deter']], -1) + + def get_dist(self, state, dtype=None): + if self._discrete: + logit = state['logit'] + logit = tf.cast(logit, tf.float32) + dist = tfd.Independent(tools.OneHotDist(logit), 1) + if dtype != tf.float32: + dist = tools.DtypeDist(dist, dtype or state['logit'].dtype) + else: + mean, std = state['mean'], state['std'] + if dtype: + mean = tf.cast(mean, dtype) + std = tf.cast(std, dtype) + dist = tfd.MultivariateNormalDiag(mean, std) + return dist + + @tf.function + def obs_step(self, prev_state, prev_action, embed, sample=True): + if not self._embed: + self._embed = embed.shape[-1] + prior = self.img_step(prev_state, prev_action, None, sample) + if self._shared: + post = self.img_step(prev_state, prev_action, embed, sample) + else: + x = tf.concat([prior['deter'], embed], -1) + for i in range(self._layers_output): + x = self.get(f'obi{i}', tfkl.Dense, self._hidden, self._act)(x) + stats = self._suff_stats_layer('obs', x) + if sample: + stoch = self.get_dist(stats).sample() + else: + stoch = self.get_dist(stats).mode() + post = {'stoch': stoch, 'deter': prior['deter'], **stats} + return post, prior + + @tf.function + def img_step(self, prev_state, prev_action, embed=None, sample=True): + prev_stoch = prev_state['stoch'] + if self._discrete: + shape = prev_stoch.shape[:-2] + [self._stoch * self._discrete] + prev_stoch = tf.reshape(prev_stoch, shape) + if self._shared: + if embed is None: + shape = prev_action.shape[:-1] + [self._embed] + embed = tf.zeros(shape, prev_action.dtype) + x = tf.concat([prev_stoch, prev_action, embed], -1) + else: + x = tf.concat([prev_stoch, prev_action], -1) + for i in range(self._layers_input): + x = self.get(f'ini{i}', tfkl.Dense, self._hidden, self._act)(x) + x, deter = self._cell(x, [prev_state['deter']]) + deter = deter[0] # Keras wraps the state in a list. 
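+        # The output layers below map the new deterministic state to the
+        # sufficient statistics of the prior over the stochastic state;
+        # obs_step builds the posterior the same way, but from the
+        # deterministic state concatenated with the image embedding.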
+ for i in range(self._layers_output): + x = self.get(f'imo{i}', tfkl.Dense, self._hidden, self._act)(x) + stats = self._suff_stats_layer('ims', x) + if sample: + stoch = self.get_dist(stats).sample() + else: + stoch = self.get_dist(stats).mode() + prior = {'stoch': stoch, 'deter': deter, **stats} + return prior + + def _suff_stats_layer(self, name, x): + if self._discrete: + x = self.get(name, tfkl.Dense, self._stoch * + self._discrete, None)(x) + logit = tf.reshape(x, x.shape[:-1] + [self._stoch, self._discrete]) + return {'logit': logit} + else: + x = self.get(name, tfkl.Dense, 2 * self._stoch, None)(x) + mean, std = tf.split(x, 2, -1) + mean = { + 'none': lambda: mean, + 'tanh5': lambda: 5.0 * tf.math.tanh(mean / 5.0), + }[self._mean_act]() + std = { + 'softplus': lambda: tf.nn.softplus(std), + 'abs': lambda: tf.math.abs(std + 1), + 'sigmoid': lambda: tf.nn.sigmoid(std), + 'sigmoid2': lambda: 2 * tf.nn.sigmoid(std / 2), + }[self._std_act]() + std = std + self._min_std + return {'mean': mean, 'std': std} + + def kl_loss(self, post, prior, balance, free, scale): + kld = tfd.kl_divergence + def dist(x): return self.get_dist(x, tf.float32) + if balance == 0.5: + value = kld(dist(prior), dist(post)) + loss = tf.reduce_mean(tf.maximum(value, free)) + else: + def sg(x): return tf.nest.map_structure(tf.stop_gradient, x) + value = kld(dist(prior), dist(sg(post))) + pri = tf.reduce_mean(value) + pos = tf.reduce_mean(kld(dist(sg(prior)), dist(post))) + pri, pos = tf.maximum(pri, free), tf.maximum(pos, free) + loss = balance * pri + (1 - balance) * pos + loss *= scale + return loss, value + + +class ConvEncoder(tools.Module): + + def __init__( + self, depth=32, act=tf.nn.relu, kernels=(4, 4, 4, 4)): + self._act = act + self._depth = depth + self._kernels = kernels + + def __call__(self, obs): + kwargs = dict(strides=2, activation=self._act) + Conv = tfkl.Conv2D + x = tf.reshape(obs['image'], (-1,) + tuple(obs['image'].shape[-3:])) + x = self.get('h1', Conv, 1 * self._depth, + self._kernels[0], **kwargs)(x) + x = self.get('h2', Conv, 2 * self._depth, + self._kernels[1], **kwargs)(x) + x = self.get('h3', Conv, 4 * self._depth, + self._kernels[2], **kwargs)(x) + x = self.get('h4', Conv, 8 * self._depth, + self._kernels[3], **kwargs)(x) + x = tf.reshape(x, [x.shape[0], np.prod(x.shape[1:])]) + shape = tf.concat([tf.shape(obs['image'])[:-3], [x.shape[-1]]], 0) + return tf.reshape(x, shape) + + +class ConvDecoder(tools.Module): + + def __init__( + self, depth=32, act=tf.nn.relu, shape=(64, 64, 3), kernels=(5, 5, 6, 6), + thin=True): + self._act = act + self._depth = depth + self._shape = shape + self._kernels = kernels + self._thin = thin + + def __call__(self, features, dtype=None): + kwargs = dict(strides=2, activation=self._act) + ConvT = tfkl.Conv2DTranspose + if self._thin: + x = self.get('h1', tfkl.Dense, 32 * self._depth, None)(features) + x = tf.reshape(x, [-1, 1, 1, 32 * self._depth]) + else: + x = self.get('h1', tfkl.Dense, 128 * self._depth, None)(features) + x = tf.reshape(x, [-1, 2, 2, 32 * self._depth]) + x = self.get('h2', ConvT, 4 * self._depth, + self._kernels[0], **kwargs)(x) + x = self.get('h3', ConvT, 2 * self._depth, + self._kernels[1], **kwargs)(x) + x = self.get('h4', ConvT, 1 * self._depth, + self._kernels[2], **kwargs)(x) + x = self.get( + 'h5', ConvT, self._shape[-1], self._kernels[3], strides=2)(x) + mean = tf.reshape(x, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + if dtype: + mean = tf.cast(mean, dtype) + return tfd.Independent(tfd.Normal(mean, 1), 
len(self._shape)) + + +class ConvDecoderMask(tools.Module): + + def __init__( + self, depth=32, act=tf.nn.relu, shape=(64, 64, 3), kernels=(5, 5, 6, 6), + thin=True): + self._act = act + self._depth = depth + self._shape = shape + self._kernels = kernels + self._thin = thin + + def __call__(self, features, dtype=None): + kwargs = dict(strides=2, activation=self._act) + ConvT = tfkl.Conv2DTranspose + if self._thin: + x = self.get('h1', tfkl.Dense, 32 * self._depth, None)(features) + x = tf.reshape(x, [-1, 1, 1, 32 * self._depth]) + else: + x = self.get('h1', tfkl.Dense, 128 * self._depth, None)(features) + x = tf.reshape(x, [-1, 2, 2, 32 * self._depth]) + x = self.get('h2', ConvT, 4 * self._depth, + self._kernels[0], **kwargs)(x) + x = self.get('h3', ConvT, 2 * self._depth, + self._kernels[1], **kwargs)(x) + x = self.get('h4', ConvT, 1 * self._depth, + self._kernels[2], **kwargs)(x) + x = self.get( + 'h5', ConvT, 2 * self._shape[-1], self._kernels[3], strides=2)(x) + mean, mask = tf.split(x, [self._shape[-1], self._shape[-1]], -1) + mean = tf.reshape(mean, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + mask = tf.reshape(mask, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + if dtype: + mean = tf.cast(mean, dtype) + mask = tf.cast(mask, dtype) + return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), mask + + +class ConvDecoderMaskEnsemble(tools.Module): + """ + ensemble two convdecoder with outputs + NOTE: remove pred1/pred2 for maximum performance. + """ + + def __init__(self, decoder1, decoder2): + self._decoder1 = decoder1 + self._decoder2 = decoder2 + self._shape = decoder1._shape + + def __call__(self, feat1, feat2, dtype=None): + kwargs = dict(strides=1, activation=tf.nn.sigmoid) + pred1, mask1 = self._decoder1(feat1, dtype) + pred2, mask2 = self._decoder2(feat2, dtype) + mean1 = pred1.submodules[0].loc + mean2 = pred2.submodules[0].loc + mask_feat = tf.concat([mask1, mask2], -1) + mask = self.get('mask1', tfkl.Conv2D, 1, 1, **kwargs)(mask_feat) + mask_use1 = mask + mask_use2 = 1-mask + mean = mean1 * tf.cast(mask_use1, mean1.dtype) + \ + mean2 * tf.cast(mask_use2, mean2.dtype) + return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), pred1, pred2, tf.cast(mask_use1, mean1.dtype) + + +class DenseHead(tools.Module): + + def __init__( + self, shape, layers, units, act=tf.nn.elu, dist='normal', std=1.0): + self._shape = (shape,) if isinstance(shape, int) else shape + self._layers = layers + self._units = units + self._act = act + self._dist = dist + self._std = std + + def __call__(self, features, dtype=None): + x = features + for index in range(self._layers): + x = self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x) + mean = self.get(f'hmean', tfkl.Dense, np.prod(self._shape))(x) + mean = tf.reshape(mean, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + if self._std == 'learned': + std = self.get(f'hstd', tfkl.Dense, np.prod(self._shape))(x) + std = tf.nn.softplus(std) + 0.01 + std = tf.reshape(std, tf.concat( + [tf.shape(features)[:-1], self._shape], 0)) + else: + std = self._std + if dtype: + mean, std = tf.cast(mean, dtype), tf.cast(std, dtype) + if self._dist == 'normal': + return tfd.Independent(tfd.Normal(mean, std), len(self._shape)) + if self._dist == 'huber': + return tfd.Independent( + tools.UnnormalizedHuber(mean, std, 1.0), len(self._shape)) + if self._dist == 'binary': + return tfd.Independent(tfd.Bernoulli(mean), len(self._shape)) + raise NotImplementedError(self._dist) + + +class ActionHead(tools.Module): + + def 
__init__( + self, size, layers, units, act=tf.nn.elu, dist='trunc_normal', + init_std=0.0, min_std=0.1, action_disc=5, temp=0.1, outscale=0): + # assert min_std <= 2 + self._size = size + self._layers = layers + self._units = units + self._dist = dist + self._act = act + self._min_std = min_std + self._init_std = init_std + self._action_disc = action_disc + self._temp = temp() if callable(temp) else temp + self._outscale = outscale + + def __call__(self, features, dtype=None): + x = features + for index in range(self._layers): + kw = {} + if index == self._layers - 1 and self._outscale: + kw['kernel_initializer'] = tf.keras.initializers.VarianceScaling( + self._outscale) + x = self.get(f'h{index}', tfkl.Dense, + self._units, self._act, **kw)(x) + if self._dist == 'tanh_normal': + # https://www.desmos.com/calculator/rcmcf5jwe7 + x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x) + if dtype: + x = tf.cast(x, dtype) + mean, std = tf.split(x, 2, -1) + mean = tf.tanh(mean) + std = tf.nn.softplus(std + self._init_std) + self._min_std + dist = tfd.Normal(mean, std) + dist = tfd.TransformedDistribution(dist, tools.TanhBijector()) + dist = tfd.Independent(dist, 1) + dist = tools.SampleDist(dist) + elif self._dist == 'tanh_normal_5': + x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x) + if dtype: + x = tf.cast(x, dtype) + mean, std = tf.split(x, 2, -1) + mean = 5 * tf.tanh(mean / 5) + std = tf.nn.softplus(std + 5) + 5 + dist = tfd.Normal(mean, std) + dist = tfd.TransformedDistribution(dist, tools.TanhBijector()) + dist = tfd.Independent(dist, 1) + dist = tools.SampleDist(dist) + elif self._dist == 'normal': + x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x) + if dtype: + x = tf.cast(x, dtype) + mean, std = tf.split(x, 2, -1) + std = tf.nn.softplus(std + self._init_std) + self._min_std + dist = tfd.Normal(mean, std) + dist = tfd.Independent(dist, 1) + elif self._dist == 'normal_1': + mean = self.get(f'hout', tfkl.Dense, self._size)(x) + if dtype: + mean = tf.cast(mean, dtype) + dist = tfd.Normal(mean, 1) + dist = tfd.Independent(dist, 1) + elif self._dist == 'trunc_normal': + # https://www.desmos.com/calculator/mmuvuhnyxo + x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x) + x = tf.cast(x, tf.float32) + mean, std = tf.split(x, 2, -1) + mean = tf.tanh(mean) + std = 2 * tf.nn.sigmoid(std / 2) + self._min_std + dist = tools.SafeTruncatedNormal(mean, std, -1, 1) + dist = tools.DtypeDist(dist, dtype) + dist = tfd.Independent(dist, 1) + elif self._dist == 'onehot': + x = self.get(f'hout', tfkl.Dense, self._size)(x) + x = tf.cast(x, tf.float32) + dist = tools.OneHotDist(x, dtype=dtype) + dist = tools.DtypeDist(dist, dtype) + elif self._dist == 'onehot_gumble': + x = self.get(f'hout', tfkl.Dense, self._size)(x) + if dtype: + x = tf.cast(x, dtype) + temp = self._temp + dist = tools.GumbleDist(temp, x, dtype=dtype) + else: + raise NotImplementedError(self._dist) + return dist + + +class GRUCell(tf.keras.layers.AbstractRNNCell): + + def __init__(self, size, norm=False, act=tf.tanh, update_bias=-1, **kwargs): + super().__init__() + self._size = size + self._act = act + self._norm = norm + self._update_bias = update_bias + self._layer = tfkl.Dense(3 * size, use_bias=norm is not None, **kwargs) + if norm: + self._norm = tfkl.LayerNormalization(dtype=tf.float32) + + @property + def state_size(self): + return self._size + + def call(self, inputs, state): + state = state[0] # Keras wraps the state in a list. 
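+    # A single fused Dense layer computes the reset, candidate, and update
+    # activations in one matrix multiply; with norm=True they are layer
+    # normalized in float32 before the nonlinearities below (the cast avoids
+    # float16 precision issues under mixed precision).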
+ parts = self._layer(tf.concat([inputs, state], -1)) + if self._norm: + dtype = parts.dtype + parts = tf.cast(parts, tf.float32) + parts = self._norm(parts) + parts = tf.cast(parts, dtype) + reset, cand, update = tf.split(parts, 3, -1) + reset = tf.nn.sigmoid(reset) + cand = self._act(reset * cand) + update = tf.nn.sigmoid(update + self._update_bias) + output = update * cand + (1 - update) * state + return output, [output] diff --git a/DreamerV2/tools.py b/DreamerV2/tools.py new file mode 100644 index 0000000..5b0dbb1 --- /dev/null +++ b/DreamerV2/tools.py @@ -0,0 +1,694 @@ +import datetime +import io +import json +import pathlib +import pickle +import re +import time +import uuid + +import numpy as np +import tensorflow as tf +import tensorflow.compat.v1 as tf1 +import tensorflow_probability as tfp +from tensorflow.keras.mixed_precision import experimental as prec +from tensorflow_probability import distributions as tfd + + +# Patch to ignore seed to avoid synchronization across GPUs. +_orig_random_categorical = tf.random.categorical +def random_categorical(*args, **kwargs): + kwargs['seed'] = None + return _orig_random_categorical(*args, **kwargs) +tf.random.categorical = random_categorical + +# Patch to ignore seed to avoid synchronization across GPUs. +_orig_random_normal = tf.random.normal +def random_normal(*args, **kwargs): + kwargs['seed'] = None + return _orig_random_normal(*args, **kwargs) +tf.random.normal = random_normal + + +class AttrDict(dict): + + __setattr__ = dict.__setitem__ + __getattr__ = dict.__getitem__ + + +class Module(tf.Module): + + def save(self, filename): + values = tf.nest.map_structure(lambda x: x.numpy(), self.variables) + amount = len(tf.nest.flatten(values)) + count = int(sum(np.prod(x.shape) for x in tf.nest.flatten(values))) + print(f'Save checkpoint with {amount} tensors and {count} parameters.') + with pathlib.Path(filename).open('wb') as f: + pickle.dump(values, f) + + def load(self, filename): + with pathlib.Path(filename).open('rb') as f: + values = pickle.load(f) + amount = len(tf.nest.flatten(values)) + count = int(sum(np.prod(x.shape) for x in tf.nest.flatten(values))) + print(f'Load checkpoint with {amount} tensors and {count} parameters.') + tf.nest.map_structure(lambda x, y: x.assign(y), self.variables, values) + + def get(self, name, ctor, *args, **kwargs): + # Create or get layer by name to avoid mentioning it in the constructor. + if not hasattr(self, '_modules'): + self._modules = {} + if name not in self._modules: + self._modules[name] = ctor(*args, **kwargs) + return self._modules[name] + + +def var_nest_names(nest): + if isinstance(nest, dict): + items = ' '.join(f'{k}:{var_nest_names(v)}' for k, v in nest.items()) + return '{' + items + '}' + if isinstance(nest, (list, tuple)): + items = ' '.join(var_nest_names(v) for v in nest) + return '[' + items + ']' + if hasattr(nest, 'name') and hasattr(nest, 'shape'): + return nest.name + str(nest.shape).replace(', ', 'x') + if hasattr(nest, 'shape'): + return str(nest.shape).replace(', ', 'x') + return '?' 
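+
+
+# Illustrative sketch (not part of the original file): Module.get above lets
+# network code create layers lazily at call time and cache them by name, so
+# a subclass can be written without constructor boilerplate, e.g.
+#
+#   class MLP(Module):
+#     def __call__(self, x):
+#       x = self.get('h1', tf.keras.layers.Dense, 32, tf.nn.elu)(x)
+#       return self.get('out', tf.keras.layers.Dense, 1)(x)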
+
+
+class Logger:
+
+  def __init__(self, logdir, step):
+    self._logdir = logdir
+    self._writer = tf.summary.create_file_writer(str(logdir), max_queue=1000)
+    self._last_step = None
+    self._last_time = None
+    self._scalars = {}
+    self._images = {}
+    self._videos = {}
+    self.step = step
+
+  def scalar(self, name, value):
+    self._scalars[name] = float(value)
+
+  def image(self, name, value):
+    self._images[name] = np.array(value)
+
+  def video(self, name, value):
+    self._videos[name] = np.array(value)
+
+  def write(self, fps=False):
+    scalars = list(self._scalars.items())
+    if fps:
+      scalars.append(('fps', self._compute_fps(self.step)))
+    print(f'[{self.step}]', ' / '.join(f'{k} {v:.1f}' for k, v in scalars))
+    with (self._logdir / 'metrics.jsonl').open('a') as f:
+      f.write(json.dumps({'step': self.step, **dict(scalars)}) + '\n')
+    with self._writer.as_default():
+      for name, value in scalars:
+        tf.summary.scalar('scalars/' + name, value, self.step)
+      for name, value in self._images.items():
+        tf.summary.image(name, value, self.step)
+      for name, value in self._videos.items():
+        video_summary(name, value, self.step)
+    self._writer.flush()
+    self._scalars = {}
+    self._images = {}
+    self._videos = {}
+
+  def _compute_fps(self, step):
+    if self._last_step is None:
+      self._last_time = time.time()
+      self._last_step = step
+      return 0
+    steps = step - self._last_step
+    duration = time.time() - self._last_time
+    self._last_time += duration
+    self._last_step = step
+    return steps / duration
+
+
+def graph_summary(writer, step, fn, *args):
+  def inner(*args):
+    tf.summary.experimental.set_step(step.numpy().item())
+    with writer.as_default():
+      fn(*args)
+  return tf.numpy_function(inner, args, [])
+
+
+def video_summary(name, video, step=None, fps=20):
+  name = name if isinstance(name, str) else name.decode('utf-8')
+  if np.issubdtype(video.dtype, np.floating):
+    video = np.clip(255 * video, 0, 255).astype(np.uint8)
+  B, T, H, W, C = video.shape
+  try:
+    # Tile the batch along the width axis and encode the result as a GIF.
+    frames = video.transpose((1, 2, 0, 3, 4)).reshape((T, H, B * W, C))
+    summary = tf1.Summary()
+    image = tf1.Summary.Image(height=B * H, width=T * W, colorspace=C)
+    image.encoded_image_string = encode_gif(frames, fps)
+    summary.value.add(tag=name, image=image)
+    tf.summary.experimental.write_raw_pb(summary.SerializeToString(), step)
+  except (IOError, OSError) as e:
+    print('GIF summaries require ffmpeg in $PATH.', e)
+    frames = video.transpose((0, 2, 1, 3, 4)).reshape((1, B * H, T * W, C))
+    tf.summary.image(name, frames, step)
+
+
+def encode_gif(frames, fps):
+  # Stream raw frames to an ffmpeg subprocess and read back an encoded GIF.
+  from subprocess import Popen, PIPE
+  h, w, c = frames[0].shape
+  pxfmt = {1: 'gray', 3: 'rgb24'}[c]
+  cmd = ' '.join([
+      'ffmpeg -y -f rawvideo -vcodec rawvideo',
+      f'-r {fps:.02f} -s {w}x{h} -pix_fmt {pxfmt} -i - -filter_complex',
+      '[0:v]split[x][z];[z]palettegen[y];[x]fifo[x];[x][y]paletteuse',
+      f'-r {fps:.02f} -f gif -'])
+  proc = Popen(cmd.split(' '), stdin=PIPE, stdout=PIPE, stderr=PIPE)
+  for image in frames:
+    proc.stdin.write(image.tobytes())
+  out, err = proc.communicate()
+  if proc.returncode:
+    raise IOError('\n'.join([cmd, err.decode('utf8')]))
+  del proc
+  return out
+
+
+def simulate(agent, envs, steps=0, episodes=0, state=None):
+  # Initialize or unpack simulation state.
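+  # The `state` tuple mirrors the return value below, so callers can feed the
+  # previous return value back in to resume collection across training
+  # iterations without resetting the environments.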
+  if state is None:
+    step, episode = 0, 0
+    done = np.ones(len(envs), bool)
+    length = np.zeros(len(envs), np.int32)
+    obs = [None] * len(envs)
+    agent_state = None
+  else:
+    step, episode, done, length, obs, agent_state = state
+  while (steps and step < steps) or (episodes and episode < episodes):
+    # Reset envs if necessary.
+    if done.any():
+      indices = [index for index, d in enumerate(done) if d]
+      results = [envs[i].reset() for i in indices]
+      for index, result in zip(indices, results):
+        obs[index] = result
+    # Step agents.
+    obs = {k: np.stack([o[k] for o in obs]) for k in obs[0]}
+    action, agent_state = agent(obs, done, agent_state)
+    if isinstance(action, dict):
+      action = [
+          {k: np.array(action[k][i]) for k in action}
+          for i in range(len(envs))]
+    else:
+      action = np.array(action)
+    assert len(action) == len(envs)
+    # Step envs.
+    results = [e.step(a) for e, a in zip(envs, action)]
+    obs, _, done = zip(*[p[:3] for p in results])
+    obs = list(obs)
+    done = np.stack(done)
+    episode += int(done.sum())
+    length += 1
+    step += (done * length).sum()
+    length *= (1 - done)
+  # Return new state to allow resuming the simulation.
+  return (step - steps, episode - episodes, done, length, obs, agent_state)
+
+
+def save_episodes(directory, episodes):
+  directory = pathlib.Path(directory).expanduser()
+  directory.mkdir(parents=True, exist_ok=True)
+  timestamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
+  filenames = []
+  for episode in episodes:
+    identifier = str(uuid.uuid4().hex)
+    length = len(episode['reward'])
+    filename = directory / f'{timestamp}-{identifier}-{length}.npz'
+    with io.BytesIO() as f1:
+      np.savez_compressed(f1, **episode)
+      f1.seek(0)
+      with filename.open('wb') as f2:
+        f2.write(f1.read())
+    filenames.append(filename)
+  return filenames
+
+
+def sample_episodes(episodes, length=None, balance=False, seed=0):
+  random = np.random.RandomState(seed)
+  while True:
+    episode = random.choice(list(episodes.values()))
+    if length:
+      total = len(next(iter(episode.values())))
+      available = total - length
+      if available < 1:
+        # print(f'Skipped short episode of length {available}.')
+        continue
+      if balance:
+        index = min(random.randint(0, total), available)
+      else:
+        index = int(random.randint(0, available + 1))
+      episode = {k: v[index: index + length] for k, v in episode.items()}
+    yield episode
+
+
+def load_episodes(directory, limit=None):
+  directory = pathlib.Path(directory).expanduser()
+  episodes = {}
+  total = 0
+  for filename in reversed(sorted(directory.glob('*.npz'))):
+    try:
+      with filename.open('rb') as f:
+        episode = np.load(f)
+        episode = {k: episode[k] for k in episode.keys()}
+    except Exception as e:
+      print(f'Could not load episode: {e}')
+      continue
+    episodes[str(filename)] = episode
+    total += len(episode['reward']) - 1
+    if limit and total >= limit:
+      break
+  return episodes
+
+
+class DtypeDist:
+
+  def __init__(self, dist, dtype=None):
+    self._dist = dist
+    self._dtype = dtype or prec.global_policy().compute_dtype
+
+  @property
+  def name(self):
+    return 'DtypeDist'
+
+  def __getattr__(self, name):
+    return getattr(self._dist, name)
+
+  def mean(self):
+    return tf.cast(self._dist.mean(), self._dtype)
+
+  def mode(self):
+    return tf.cast(self._dist.mode(), self._dtype)
+
+  def entropy(self):
+    return tf.cast(self._dist.entropy(), self._dtype)
+
+  def sample(self, *args, **kwargs):
+    return tf.cast(self._dist.sample(*args, **kwargs), self._dtype)
+
+
+class SampleDist:
+
+  def __init__(self, dist, samples=100):
self._dist = dist + self._samples = samples + + @property + def name(self): + return 'SampleDist' + + def __getattr__(self, name): + return getattr(self._dist, name) + + def mean(self): + samples = self._dist.sample(self._samples) + return tf.reduce_mean(samples, 0) + + def mode(self): + sample = self._dist.sample(self._samples) + logprob = self._dist.log_prob(sample) + return tf.gather(sample, tf.argmax(logprob))[0] + + def entropy(self): + sample = self._dist.sample(self._samples) + logprob = self.log_prob(sample) + return -tf.reduce_mean(logprob, 0) + + +class OneHotDist(tfd.OneHotCategorical): + + def __init__(self, logits=None, probs=None, dtype=None): + self._sample_dtype = dtype or prec.global_policy().compute_dtype + super().__init__(logits=logits, probs=probs) + + def mode(self): + return tf.cast(super().mode(), self._sample_dtype) + + def sample(self, sample_shape=(), seed=None): + # Straight through biased gradient estimator. + sample = tf.cast(super().sample(sample_shape, seed), self._sample_dtype) + probs = super().probs_parameter() + while len(probs.shape) < len(sample.shape): + probs = probs[None] + sample += tf.cast(probs - tf.stop_gradient(probs), self._sample_dtype) + return sample + + +class GumbleDist(tfd.RelaxedOneHotCategorical): + + def __init__(self, temp, logits=None, probs=None, dtype=None): + self._sample_dtype = dtype or prec.global_policy().compute_dtype + self._exact = tfd.OneHotCategorical(logits=logits, probs=probs) + super().__init__(temp, logits=logits, probs=probs) + + def mode(self): + return tf.cast(self._exact.mode(), self._sample_dtype) + + def entropy(self): + return tf.cast(self._exact.entropy(), self._sample_dtype) + + def sample(self, sample_shape=(), seed=None): + return tf.cast(super().sample(sample_shape, seed), self._sample_dtype) + + +class UnnormalizedHuber(tfd.Normal): + + def __init__(self, loc, scale, threshold=1, **kwargs): + self._threshold = tf.cast(threshold, loc.dtype) + super().__init__(loc, scale, **kwargs) + + def log_prob(self, event): + return -(tf.math.sqrt( + (event - self.mean()) ** 2 + self._threshold ** 2) - self._threshold) + + +class SafeTruncatedNormal(tfd.TruncatedNormal): + + def __init__(self, loc, scale, low, high, clip=1e-6, mult=1): + super().__init__(loc, scale, low, high) + self._clip = clip + self._mult = mult + + def sample(self, *args, **kwargs): + event = super().sample(*args, **kwargs) + if self._clip: + clipped = tf.clip_by_value( + event, self.low + self._clip, self.high - self._clip) + event = event - tf.stop_gradient(event) + tf.stop_gradient(clipped) + if self._mult: + event *= self._mult + return event + + +class TanhBijector(tfp.bijectors.Bijector): + + def __init__(self, validate_args=False, name='tanh'): + super().__init__( + forward_min_event_ndims=0, + validate_args=validate_args, + name=name) + + def _forward(self, x): + return tf.nn.tanh(x) + + def _inverse(self, y): + dtype = y.dtype + y = tf.cast(y, tf.float32) + y = tf.where( + tf.less_equal(tf.abs(y), 1.), + tf.clip_by_value(y, -0.99999997, 0.99999997), y) + y = tf.atanh(y) + y = tf.cast(y, dtype) + return y + + def _forward_log_det_jacobian(self, x): + log2 = tf.math.log(tf.constant(2.0, dtype=x.dtype)) + return 2.0 * (log2 - x - tf.nn.softplus(-2.0 * x)) + + +def lambda_return( + reward, value, pcont, bootstrap, lambda_, axis): + # Setting lambda=1 gives a discounted Monte Carlo return. + # Setting lambda=0 gives a fixed 1-step return. 
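+  # Computes the lambda-return recursively along the time axis:
+  #   R_t = r_t + pcont_t * ((1 - lambda) * V_{t+1} + lambda * R_{t+1}),
+  # bootstrapped with R_T = bootstrap (zeros if not given).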
+ assert reward.shape.ndims == value.shape.ndims, (reward.shape, value.shape) + if isinstance(pcont, (int, float)): + pcont = pcont * tf.ones_like(reward) + dims = list(range(reward.shape.ndims)) + dims = [axis] + dims[1:axis] + [0] + dims[axis + 1:] + if axis != 0: + reward = tf.transpose(reward, dims) + value = tf.transpose(value, dims) + pcont = tf.transpose(pcont, dims) + if bootstrap is None: + bootstrap = tf.zeros_like(value[-1]) + next_values = tf.concat([value[1:], bootstrap[None]], 0) + inputs = reward + pcont * next_values * (1 - lambda_) + returns = static_scan( + lambda agg, cur: cur[0] + cur[1] * lambda_ * agg, + (inputs, pcont), bootstrap, reverse=True) + if axis != 0: + returns = tf.transpose(returns, dims) + return returns + + +class Optimizer(tf.Module): + + def __init__( + self, name, lr, eps=1e-4, clip=None, wd=None, wd_pattern=r'.*', + opt='adam'): + assert 0 <= wd < 1 + assert not clip or 1 <= clip + self._name = name + self._clip = clip + self._wd = wd + self._wd_pattern = wd_pattern + self._opt = { + 'adam': lambda: tf.optimizers.Adam(lr, epsilon=eps), + 'nadam': lambda: tf.optimizers.Nadam(lr, epsilon=eps), + 'adamax': lambda: tf.optimizers.Adamax(lr, epsilon=eps), + 'sgd': lambda: tf.optimizers.SGD(lr), + 'momentum': lambda: tf.optimizers.SGD(lr, 0.9), + }[opt]() + self._mixed = (prec.global_policy().compute_dtype == tf.float16) + if self._mixed: + self._opt = prec.LossScaleOptimizer(self._opt, 'dynamic') + + @property + def variables(self): + return self._opt.variables() + + def __call__(self, tape, loss, modules, prefix=None): + assert loss.dtype is tf.float32, self._name + modules = modules if hasattr(modules, '__len__') else (modules,) + varibs = tf.nest.flatten([module.variables for module in modules]) + count = sum(np.prod(x.shape) for x in varibs) + print(f'Found {count} {self._name} parameters.') + assert len(loss.shape) == 0, loss.shape + tf.debugging.check_numerics(loss, self._name + '_loss') + if self._mixed: + with tape: + loss = self._opt.get_scaled_loss(loss) + grads = tape.gradient(loss, varibs) + if self._mixed: + grads = self._opt.get_unscaled_gradients(grads) + norm = tf.linalg.global_norm(grads) + if not self._mixed: + tf.debugging.check_numerics(norm, self._name + '_norm') + if self._clip: + grads, _ = tf.clip_by_global_norm(grads, self._clip, norm) + if self._wd: + self._apply_weight_decay(varibs) + self._opt.apply_gradients(zip(grads, varibs)) + metrics = {} + if prefix: + metrics[f'{prefix}/{self._name}_loss'] = loss + metrics[f'{prefix}/{self._name}_grad_norm'] = norm + if self._mixed: + metrics[f'{prefix}/{self._name}_loss_scale'] = \ + self._opt.loss_scale._current_loss_scale + else: + metrics[f'{self._name}_loss'] = loss + metrics[f'{self._name}_grad_norm'] = norm + if self._mixed: + metrics[f'{self._name}_loss_scale'] = \ + self._opt.loss_scale._current_loss_scale + return metrics + + def _apply_weight_decay(self, varibs): + nontrivial = (self._wd_pattern != r'.*') + if nontrivial: + print('Applied weight decay to variables:') + for var in varibs: + if re.search(self._wd_pattern, self._name + '/' + var.name): + if nontrivial: + print('- ' + self._name + '/' + var.name) + var.assign((1 - self._wd) * var) + + +def args_type(default): + def parse_string(x): + if default is None: + return x + if isinstance(default, bool): + return bool(['False', 'True'].index(x)) + if isinstance(default, int): + return float(x) if ('e' in x or '.' 
in x) else int(x) + if isinstance(default, (list, tuple)): + return tuple(args_type(default[0])(y) for y in x.split(',')) + return type(default)(x) + def parse_object(x): + if isinstance(default, (list, tuple)): + return tuple(x) + return x + return lambda x: parse_string(x) if isinstance(x, str) else parse_object(x) + + +def static_scan(fn, inputs, start, reverse=False): + last = start + outputs = [[] for _ in tf.nest.flatten(start)] + indices = range(len(tf.nest.flatten(inputs)[0])) + if reverse: + indices = reversed(indices) + for index in indices: + inp = tf.nest.map_structure(lambda x: x[index], inputs) + last = fn(last, inp) + [o.append(l) for o, l in zip(outputs, tf.nest.flatten(last))] + if reverse: + outputs = [list(reversed(x)) for x in outputs] + outputs = [tf.stack(x, 0) for x in outputs] + return tf.nest.pack_sequence_as(start, outputs) + + +def uniform_mixture(dist, dtype=None): + if dist.batch_shape[-1] == 1: + return tfd.BatchReshape(dist, dist.batch_shape[:-1]) + dtype = dtype or prec.global_policy().compute_dtype + weights = tfd.Categorical(tf.zeros(dist.batch_shape, dtype)) + return tfd.MixtureSameFamily(weights, dist) + + +def cat_mixture_entropy(dist): + if isinstance(dist, tfd.MixtureSameFamily): + probs = dist.components_distribution.probs_parameter() + else: + probs = dist.probs_parameter() + return -tf.reduce_mean( + tf.reduce_mean(probs, 2) * + tf.math.log(tf.reduce_mean(probs, 2) + 1e-8), -1) + + +@tf.function +def cem_planner( + state, num_actions, horizon, proposals, topk, iterations, imagine, + objective): + dtype = prec.global_policy().compute_dtype + B, P = list(state.values())[0].shape[0], proposals + H, A = horizon, num_actions + flat_state = {k: tf.repeat(v, P, 0) for k, v in state.items()} + mean = tf.zeros((B, H, A), dtype) + std = tf.ones((B, H, A), dtype) + for _ in range(iterations): + proposals = tf.random.normal((B, P, H, A), dtype=dtype) + proposals = proposals * std[:, None] + mean[:, None] + proposals = tf.clip_by_value(proposals, -1, 1) + flat_proposals = tf.reshape(proposals, (B * P, H, A)) + states = imagine(flat_proposals, flat_state) + scores = objective(states) + scores = tf.reshape(tf.reduce_sum(scores, -1), (B, P)) + _, indices = tf.math.top_k(scores, topk, sorted=False) + best = tf.gather(proposals, indices, axis=1, batch_dims=1) + mean, var = tf.nn.moments(best, 1) + std = tf.sqrt(var + 1e-6) + return mean[:, 0, :] + + +@tf.function +def grad_planner( + state, num_actions, horizon, proposals, iterations, imagine, objective, + kl_scale, step_size): + dtype = prec.global_policy().compute_dtype + B, P = list(state.values())[0].shape[0], proposals + H, A = horizon, num_actions + flat_state = {k: tf.repeat(v, P, 0) for k, v in state.items()} + mean = tf.zeros((B, H, A), dtype) + rawstd = 0.54 * tf.ones((B, H, A), dtype) + for _ in range(iterations): + proposals = tf.random.normal((B, P, H, A), dtype=dtype) + with tf.GradientTape(watch_accessed_variables=False) as tape: + tape.watch(mean) + tape.watch(rawstd) + std = tf.nn.softplus(rawstd) + proposals = proposals * std[:, None] + mean[:, None] + proposals = ( + tf.stop_gradient(tf.clip_by_value(proposals, -1, 1)) + + proposals - tf.stop_gradient(proposals)) + flat_proposals = tf.reshape(proposals, (B * P, H, A)) + states = imagine(flat_proposals, flat_state) + scores = objective(states) + scores = tf.reshape(tf.reduce_sum(scores, -1), (B, P)) + div = tfd.kl_divergence( + tfd.Normal(mean, std), + tfd.Normal(tf.zeros_like(mean), tf.ones_like(std))) + elbo = tf.reduce_sum(scores) - kl_scale * 
div + elbo /= tf.cast(tf.reduce_prod(tf.shape(scores)), dtype) + grad_mean, grad_rawstd = tape.gradient(elbo, [mean, rawstd]) + e, v = tf.nn.moments(grad_mean, [1, 2], keepdims=True) + grad_mean /= tf.sqrt(e * e + v + 1e-4) + e, v = tf.nn.moments(grad_rawstd, [1, 2], keepdims=True) + grad_rawstd /= tf.sqrt(e * e + v + 1e-4) + mean = tf.clip_by_value(mean + step_size * grad_mean, -1, 1) + rawstd = rawstd + step_size * grad_rawstd + return mean[:, 0, :] + + +class Every: + + def __init__(self, every): + self._every = every + self._last = None + + def __call__(self, step): + if not self._every: + return False + if self._last is None: + self._last = step + return True + if step >= self._last + self._every: + self._last += self._every + return True + return False + + +class Once: + + def __init__(self): + self._once = True + + def __call__(self): + if self._once: + self._once = False + return True + return False + + +class Until: + + def __init__(self, until): + self._until = until + + def __call__(self, step): + if not self._until: + return True + return step < self._until + + +def schedule(string, step): + try: + return float(string) + except ValueError: + step = tf.cast(step, tf.float32) + match = re.match(r'linear\((.+),(.+),(.+)\)', string) + if match: + initial, final, duration = [float(group) for group in match.groups()] + mix = tf.clip_by_value(step / duration, 0, 1) + return (1 - mix) * initial + mix * final + match = re.match(r'warmup\((.+),(.+)\)', string) + if match: + warmup, value = [float(group) for group in match.groups()] + scale = tf.clip_by_value(step / warmup, 0, 1) + return scale * value + match = re.match(r'exp\((.+),(.+),(.+)\)', string) + if match: + initial, final, halflife = [float(group) for group in match.groups()] + return (initial - final) * 0.5 ** (step / halflife) + final + raise NotImplementedError(string) diff --git a/DreamerV2/wrappers.py b/DreamerV2/wrappers.py new file mode 100644 index 0000000..7abd6e9 --- /dev/null +++ b/DreamerV2/wrappers.py @@ -0,0 +1,280 @@ +import threading + +import gym +import numpy as np + +class DeepMindControl: + + def __init__(self, name, action_repeat=1, size=(64, 64), camera=None): + domain, task = name.split('_', 1) + if domain == 'cup': # Only domain with multiple words. 
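+      # Task names follow the '{domain}_{task}' convention, so the two-word
+      # domain ball_in_cup is abbreviated as 'cup' and expanded back here.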
+ domain = 'ball_in_cup' + if isinstance(domain, str): + from dm_control import suite + self._env = suite.load(domain, task) + else: + assert task is None + self._env = domain() + self._action_repeat = action_repeat + self._size = size + if camera is None: + camera = dict(quadruped=2).get(domain, 0) + self._camera = camera + + @property + def observation_space(self): + spaces = {} + for key, value in self._env.observation_spec().items(): + spaces[key] = gym.spaces.Box( + -np.inf, np.inf, value.shape, dtype=np.float32) + spaces['image'] = gym.spaces.Box( + 0, 255, self._size + (3,), dtype=np.uint8) + return gym.spaces.Dict(spaces) + + @property + def action_space(self): + spec = self._env.action_spec() + return gym.spaces.Box(spec.minimum, spec.maximum, dtype=np.float32) + + def step(self, action): + assert np.isfinite(action).all(), action + reward = 0 + for _ in range(self._action_repeat): + time_step = self._env.step(action) + reward += time_step.reward or 0 + if time_step.last(): + break + obs = dict(time_step.observation) + obs['image'] = self.render() + done = time_step.last() + info = {'discount': np.array(time_step.discount, np.float32)} + return obs, reward, done, info + + def reset(self): + time_step = self._env.reset() + obs = dict(time_step.observation) + obs['image'] = self.render() + return obs + + def render(self, *args, **kwargs): + if kwargs.get('mode', 'rgb_array') != 'rgb_array': + raise ValueError("Only render mode 'rgb_array' is supported.") + return self._env.physics.render(*self._size, camera_id=self._camera) + + +class Atari: + + LOCK = threading.Lock() + + def __init__( + self, name, action_repeat=4, size=(84, 84), grayscale=True, noops=30, + life_done=False, sticky_actions=True, all_actions=False): + assert size[0] == size[1] + import gym.wrappers + import gym.envs.atari + with self.LOCK: + env = gym.envs.atari.AtariEnv( + game=name, obs_type='image', frameskip=1, + repeat_action_probability=0.25 if sticky_actions else 0.0, + full_action_space=all_actions) + # Avoid unnecessary rendering in inner env. + env._get_obs = lambda: None + # Tell wrapper that the inner env has no action repeat. 
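+    # gym's AtariPreprocessing rejects frame skipping when the wrapped env
+    # already repeats actions; a spec with 'NoFrameskip' in its id signals
+    # that the inner env steps one frame at a time.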
+ env.spec = gym.envs.registration.EnvSpec('NoFrameskip-v0') + env = gym.wrappers.AtariPreprocessing( + env, noops, action_repeat, size[0], life_done, grayscale) + self._env = env + self._grayscale = grayscale + + @property + def observation_space(self): + return gym.spaces.Dict({ + 'image': self._env.observation_space, + 'ram': gym.spaces.Box(0, 255, (128,), np.uint8), + }) + + @property + def action_space(self): + return self._env.action_space + + def close(self): + return self._env.close() + + def reset(self): + with self.LOCK: + image = self._env.reset() + if self._grayscale: + image = image[..., None] + obs = {'image': image, 'ram': self._env.env._get_ram()} + return obs + + def step(self, action): + image, reward, done, info = self._env.step(action) + if self._grayscale: + image = image[..., None] + obs = {'image': image, 'ram': self._env.env._get_ram()} + return obs, reward, done, info + + def render(self, mode): + return self._env.render(mode) + +class CollectDataset: + + def __init__(self, env, callbacks=None, precision=32): + self._env = env + self._callbacks = callbacks or () + self._precision = precision + self._episode = None + + def __getattr__(self, name): + return getattr(self._env, name) + + def step(self, action): + obs, reward, done, info = self._env.step(action) + obs = {k: self._convert(v) for k, v in obs.items()} + transition = obs.copy() + transition['action'] = action + transition['reward'] = reward + transition['discount'] = info.get('discount', np.array(1 - float(done))) + self._episode.append(transition) + if done: + episode = {k: [t[k] for t in self._episode] for k in self._episode[0]} + episode = {k: self._convert(v) for k, v in episode.items()} + info['episode'] = episode + for callback in self._callbacks: + callback(episode) + return obs, reward, done, info + + def reset(self): + obs = self._env.reset() + transition = obs.copy() + transition['action'] = np.zeros(self._env.action_space.shape) + transition['reward'] = 0.0 + transition['discount'] = 1.0 + self._episode = [transition] + return obs + + def _convert(self, value): + value = np.array(value) + if np.issubdtype(value.dtype, np.floating): + dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self._precision] + elif np.issubdtype(value.dtype, np.signedinteger): + dtype = {16: np.int16, 32: np.int32, 64: np.int64}[self._precision] + elif np.issubdtype(value.dtype, np.uint8): + dtype = np.uint8 + else: + raise NotImplementedError(value.dtype) + return value.astype(dtype) + + +class TimeLimit: + + def __init__(self, env, duration): + self._env = env + self._duration = duration + self._step = None + + def __getattr__(self, name): + return getattr(self._env, name) + + def step(self, action): + assert self._step is not None, 'Must reset environment.' 
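+    # `done` is forced to True below once the duration is reached, but the
+    # discount defaults to 1.0 so the agent does not treat the timeout as a
+    # true terminal state.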
+ obs, reward, done, info = self._env.step(action) + self._step += 1 + if self._step >= self._duration: + done = True + if 'discount' not in info: + info['discount'] = np.array(1.0).astype(np.float32) + self._step = None + return obs, reward, done, info + + def reset(self): + self._step = 0 + return self._env.reset() + + +class NormalizeActions: + + def __init__(self, env): + self._env = env + self._mask = np.logical_and( + np.isfinite(env.action_space.low), + np.isfinite(env.action_space.high)) + self._low = np.where(self._mask, env.action_space.low, -1) + self._high = np.where(self._mask, env.action_space.high, 1) + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def action_space(self): + low = np.where(self._mask, -np.ones_like(self._low), self._low) + high = np.where(self._mask, np.ones_like(self._low), self._high) + return gym.spaces.Box(low, high, dtype=np.float32) + + def step(self, action): + original = (action + 1) / 2 * (self._high - self._low) + self._low + original = np.where(self._mask, original, action) + return self._env.step(original) + + +class OneHotAction: + + def __init__(self, env): + assert isinstance(env.action_space, gym.spaces.Discrete) + self._env = env + self._random = np.random.RandomState() + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def action_space(self): + shape = (self._env.action_space.n,) + space = gym.spaces.Box(low=0, high=1, shape=shape, dtype=np.float32) + space.sample = self._sample_action + return space + + def step(self, action): + index = np.argmax(action).astype(int) + reference = np.zeros_like(action) + reference[index] = 1 + if not np.allclose(reference, action): + raise ValueError(f'Invalid one-hot action:\n{action}') + return self._env.step(index) + + def reset(self): + return self._env.reset() + + def _sample_action(self): + actions = self._env.action_space.n + index = self._random.randint(0, actions) + reference = np.zeros(actions, dtype=np.float32) + reference[index] = 1.0 + return reference + + +class RewardObs: + + def __init__(self, env): + self._env = env + + def __getattr__(self, name): + return getattr(self._env, name) + + @property + def observation_space(self): + spaces = self._env.observation_space.spaces + assert 'reward' not in spaces + spaces['reward'] = gym.spaces.Box(-np.inf, np.inf, dtype=np.float32) + return gym.spaces.Dict(spaces) + + def step(self, action): + obs, reward, done, info = self._env.step(action) + obs['reward'] = reward + return obs, reward, done, info + + def reset(self): + obs = self._env.reset() + obs['reward'] = 0.0 + return obs diff --git a/README.md b/README.md new file mode 100644 index 0000000..99bbbfe --- /dev/null +++ b/README.md @@ -0,0 +1,126 @@ +# Learning Task Informed Abstractions (TIA) + +Left to right: Raw Observation, Dreamer, Joint of TIA, Task Stream of TIA, Distractor Stream of TIA + +![](imgs/gt.gif) ![](imgs/pred.gif) ![](imgs/joint.gif) ![](imgs/main.gif) ![](imgs/disen.gif) + + +This code base contains a minimal modification over [Dreamer](https://danijar.com/project/dreamer/)/[DreamerV2](https://danijar.com/project/dreamerv2/) to learn disentangled world models, presented in: + +**Learning Task Informed Abstractions** + +Xiang Fu*, Ge Yang*, Pulkit Agrawal, Tommi Jaakkola + +ICML 2021 [[website]](https://xiangfu.co/tia) [[paper]](https://arxiv.org/abs/2106.15612) + + +The directory [Dreamer](./Dreamer) contains code for running DMC experiments. 
The directory [DreamerV2](./DreamerV2) contains code for running Atari experiments. This implementation is tested with Python 3.6, TensorFlow 2.3.1 and CUDA 10.1. The training/evaluation metrics used to produce the figures in the paper can be downloaded from [this Google Drive link](https://drive.google.com/file/d/1wvSp9Q7r2Ah5xRE_x3nJy-uwLkjF2RgX/view?usp=sharing).
+
+If you find this code useful, please consider citing:
+
+```
+@inproceedings{fu2021learning,
+  title={Learning Task Informed Abstractions},
+  author={Fu, Xiang and Yang, Ge and Agrawal, Pulkit and Jaakkola, Tommi},
+  booktitle={ICML},
+  year={2021}
+}
+```
+
+## Getting started
+
+Get dependencies:
+
+```sh
+pip3 install --user tensorflow-gpu==2.3.1
+pip3 install --user tensorflow_probability==0.11.0
+pip3 install --user gym
+pip3 install --user pandas
+pip3 install --user matplotlib
+pip3 install --user ruamel.yaml
+pip3 install --user scikit-image
+pip3 install --user git+git://github.com/deepmind/dm_control.git
+pip3 install --user 'gym[atari]'
+```
+
+You will need an active MuJoCo license to run DMC experiments.
+
+## Running DMC experiments with distracting background
+
+Code for running DMC experiments is under the directory [Dreamer](./Dreamer).
+
+To run DMC experiments with distracting video backgrounds, you can download a small set of 16 videos (videos with names starting with "A" in the Kinetics 400 dataset's `driving_car` class) from [this Google Drive link](https://drive.google.com/file/d/1f-ER2XnhpvQeGjlJaoGRiLR0oEjn6Le_/view?usp=sharing), which was used to produce Figure 9(a) in the paper's appendix.
+
+To replicate the setup of [DBC](https://github.com/facebookresearch/deep_bisim4control) and use more background videos, first download the Kinetics 400 dataset and grab the `driving_car` label from the train dataset. Use the repo:
+
+[https://github.com/Showmax/kinetics-downloader](https://github.com/Showmax/kinetics-downloader)
+
+to download the dataset.
+
+Train the agent:
+
+```sh
+python run.py --method dreamer --configs dmc --task dmc_cheetah_run_driving --logdir ~/logdir --video_dir VIDPATH
+```
+
+`VIDPATH` should contain `*.mp4` video files. (If you used the above repo to download the Kinetics videos, set `VIDPATH` to `PATH_TO_REPO/kinetics-downloader/dataset/train/driving_car`.)
+
+
+Choose a method from:
+
+```
+[dreamer, tia, inverse]
+```
+
+corresponding to the original Dreamer, TIA, and a representation learned with an inverse model, as described in Section 4.2 of the paper.
+
+
+Choose an environment + distraction combination (e.g. `dmc_cheetah_run_driving`):
+
+```
+dmc_{domain}_{task}_{distraction}
+```
+
+where {domain} (e.g., cheetah, walker, hopper, etc.) and {task} (e.g., run, walk, stand, etc.) are from the DeepMind Control Suite, and {distraction} can be chosen from:
+
+```
+[none, noise, driving]
+```
+
+where each option uses a different background:
+
+```
+none: default (no) background
+
+noise: white noise background
+
+driving: natural videos from the "driving car" class as background
+```
+
+## Running Atari experiments
+
+Code for running Atari experiments is under the directory [DreamerV2](./DreamerV2).
+
+Train the agent with the game Demon Attack:
+
+```sh
+python dreamer.py --logdir ~/logdir/atari_demon_attack/TIA/1 \
+    --configs defaults atari --task atari_demon_attack
+```
+
+## Monitoring results
+
+Both DMC and Atari experiments log with TensorBoard by default. The decomposition of the two streams of TIA is visualized as `.gif` animations.
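+
+Metrics are also appended to a `metrics.jsonl` file in the log directory, one JSON object per logging step. Below is a minimal sketch for plotting a logged scalar from it; it assumes `pandas` and `matplotlib` from the dependency list above, and the log path and the `return` column are placeholders for your own run:
+
+```python
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Each line holds one logged step, e.g. {"step": 1000, "return": 123.4, ...}.
+df = pd.read_json('LOGDIR/metrics.jsonl', lines=True)
+df.plot(x='step', y='return')  # 'return' is a placeholder column name
+plt.show()
+```
+
+Since every write appends a complete JSON line, the same file can be re-read periodically while training is still running.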
+Access TensorBoard with the command:
+
+```sh
+tensorboard --logdir LOGDIR
+```
+
+
+## Reference
+
+We modify [Dreamer](https://github.com/danijar/dreamer) for DMC environments and [DreamerV2](https://github.com/danijar/dreamerv2) for Atari games. Thanks to Danijar for releasing his very clean implementations! Utilities such as
+
+- logging with TensorBoard/JSON line files
+- debugging with the `debug` flag
+- mixed precision training
+
+are the same as in the respective original implementations.
diff --git a/imgs/disen.gif b/imgs/disen.gif
new file mode 100644
index 0000000..77d36e8
Binary files /dev/null and b/imgs/disen.gif differ
diff --git a/imgs/gt.gif b/imgs/gt.gif
new file mode 100644
index 0000000..d3e115b
Binary files /dev/null and b/imgs/gt.gif differ
diff --git a/imgs/joint.gif b/imgs/joint.gif
new file mode 100644
index 0000000..f978b29
Binary files /dev/null and b/imgs/joint.gif differ
diff --git a/imgs/main.gif b/imgs/main.gif
new file mode 100644
index 0000000..675d596
Binary files /dev/null and b/imgs/main.gif differ
diff --git a/imgs/pred.gif b/imgs/pred.gif
new file mode 100644
index 0000000..65c3578
Binary files /dev/null and b/imgs/pred.gif differ