initial commit.

Xiang Fu 2021-06-29 21:20:44 -04:00
commit 297b5bb62d
87 changed files with 13309 additions and 0 deletions

5
.gitignore vendored Normal file

@@ -0,0 +1,5 @@
__pycache__/
*.py[cod]
*.egg-info
dist/
MUJOCO_LOG.TXT

52
Dreamer/dmc2gym/__init__.py Normal file

@@ -0,0 +1,52 @@
import gym
from gym.envs.registration import register
def make(
domain_name,
task_name,
resource_files,
img_source,
total_frames,
seed=1,
visualize_reward=True,
from_pixels=False,
height=84,
width=84,
camera_id=0,
frame_skip=1,
episode_length=1000,
environment_kwargs=None
):
env_id = 'dmc_%s_%s_%s-v1' % (domain_name, task_name, seed)
if from_pixels:
assert not visualize_reward, 'cannot use visualize reward when learning from pixels'
# shorten episode length
max_episode_steps = (episode_length + frame_skip - 1) // frame_skip
if env_id not in gym.envs.registry.env_specs:
register(
id=env_id,
entry_point='dmc2gym.wrappers:DMCWrapper',
kwargs={
'domain_name': domain_name,
'task_name': task_name,
'resource_files': resource_files,
'img_source': img_source,
'total_frames': total_frames,
'task_kwargs': {
'random': seed
},
'environment_kwargs': environment_kwargs,
'visualize_reward': visualize_reward,
'from_pixels': from_pixels,
'height': height,
'width': width,
'camera_id': camera_id,
'frame_skip': frame_skip,
},
# max_episode_steps=max_episode_steps
)
return gym.make(env_id)
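For orientation, a minimal usage sketch of this helper. The domain/task names and the video glob are illustrative values, not defaults shipped in this commit:

```python
import dmc2gym

# Hypothetical call: pixel observations with a natural-video background.
env = dmc2gym.make(
    domain_name='cheetah',
    task_name='run',
    resource_files='~/videos/*.mp4',  # illustrative glob of background videos
    img_source='video',               # or 'images', 'noise', 'color', None
    total_frames=1000,
    seed=1,
    visualize_reward=False,           # must be False when from_pixels=True
    from_pixels=True,
    height=84,
    width=84,
    frame_skip=2,
)
```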

183
Dreamer/dmc2gym/natural_imgsource.py Normal file

@@ -0,0 +1,183 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import cv2
import skvideo.io
import random
import tqdm
class BackgroundMatting(object):
"""
Produce a mask by matching the given color. This is a simple but
effective strategy for many games.
"""
def __init__(self, color):
"""
Args:
color: a (r, g, b) tuple or single value for grayscale
"""
self._color = color
def get_mask(self, img):
return img == self._color
class ImageSource(object):
"""
Source of natural images to be added to a simulated environment.
"""
def get_image(self):
"""
Returns:
an RGB image of [h, w, 3] with a fixed shape.
"""
pass
def reset(self):
""" Called when an episode ends. """
pass
class FixedColorSource(ImageSource):
def __init__(self, shape, color):
"""
Args:
shape: [h, w]
color: a 3-tuple
"""
self.arr = np.zeros((shape[0], shape[1], 3))
self.arr[:, :] = color
def get_image(self):
return self.arr
class RandomColorSource(ImageSource):
def __init__(self, shape):
"""
Args:
shape: [h, w]
"""
self.shape = shape
self.arr = None
self.reset()
def reset(self):
self._color = np.random.randint(0, 256, size=(3,))
self.arr = np.zeros((self.shape[0], self.shape[1], 3))
self.arr[:, :] = self._color
def get_image(self):
return self.arr
class NoiseSource(ImageSource):
def __init__(self, shape, strength=255):
"""
Args:
shape: [h, w]
strength (int): the strength of noise, in range [0, 255]
"""
self.shape = shape
self.strength = strength
def get_image(self):
return np.random.randn(self.shape[0], self.shape[1], 3) * self.strength
class RandomImageSource(ImageSource):
def __init__(self, shape, filelist, total_frames=None, grayscale=False):
"""
Args:
shape: [h, w]
filelist: a list of image files
"""
self.grayscale = grayscale
self.total_frames = total_frames
self.shape = shape
self.filelist = filelist
self.build_arr()
self.current_idx = 0
self.reset()
def build_arr(self):
self.total_frames = self.total_frames if self.total_frames else len(self.filelist)
self.arr = np.zeros((self.total_frames, self.shape[0], self.shape[1]) + ((3,) if not self.grayscale else (1,)))
for i in range(self.total_frames):
# if i % len(self.filelist) == 0: random.shuffle(self.filelist)
fname = self.filelist[i % len(self.filelist)]
if self.grayscale: im = cv2.imread(fname, cv2.IMREAD_GRAYSCALE)[..., None]
else: im = cv2.imread(fname, cv2.IMREAD_COLOR)
self.arr[i] = cv2.resize(im, (self.shape[1], self.shape[0])) ## THIS IS NOT A BUG! cv2 uses (width, height)
def reset(self):
self._loc = np.random.randint(0, self.total_frames)
def get_image(self):
return self.arr[self._loc]
class RandomVideoSource(ImageSource):
def __init__(self, shape, filelist, total_frames=None, grayscale=False):
"""
Args:
shape: [h, w]
filelist: a list of video files
"""
self.grayscale = grayscale
self.total_frames = total_frames
self.shape = shape
self.filelist = filelist
self.build_arr()
self.current_idx = 0
self.reset()
def build_arr(self):
if not self.total_frames:
self.total_frames = 0
self.arr = None
random.shuffle(self.filelist)
for fname in tqdm.tqdm(self.filelist, desc="Loading videos for natural", position=0):
if self.grayscale: frames = skvideo.io.vread(fname, outputdict={"-pix_fmt": "gray"})
else: frames = skvideo.io.vread(fname)
local_arr = np.zeros((frames.shape[0], self.shape[0], self.shape[1]) + ((3,) if not self.grayscale else (1,)))
for i in tqdm.tqdm(range(frames.shape[0]), desc="video frames", position=1):
local_arr[i] = cv2.resize(frames[i], (self.shape[1], self.shape[0])) ## THIS IS NOT A BUG! cv2 uses (width, height)
if self.arr is None:
self.arr = local_arr
else:
self.arr = np.concatenate([self.arr, local_arr], 0)
self.total_frames += local_arr.shape[0]
else:
self.arr = np.zeros((self.total_frames, self.shape[0], self.shape[1]) + ((3,) if not self.grayscale else (1,)))
total_frame_i = 0
file_i = 0
with tqdm.tqdm(total=self.total_frames, desc="Loading videos for natural") as pbar:
while total_frame_i < self.total_frames:
if file_i % len(self.filelist) == 0: random.shuffle(self.filelist)
file_i += 1
fname = self.filelist[file_i % len(self.filelist)]
if self.grayscale: frames = skvideo.io.vread(fname, outputdict={"-pix_fmt": "gray"})
else: frames = skvideo.io.vread(fname)
for frame_i in range(frames.shape[0]):
if total_frame_i >= self.total_frames: break
if self.grayscale:
self.arr[total_frame_i] = cv2.resize(frames[frame_i], (self.shape[1], self.shape[0]))[..., None] ## THIS IS NOT A BUG! cv2 uses (width, height)
else:
self.arr[total_frame_i] = cv2.resize(frames[frame_i], (self.shape[1], self.shape[0]))
pbar.update(1)
total_frame_i += 1
def reset(self):
self._loc = np.random.randint(0, self.total_frames)
def get_image(self):
img = self.arr[self._loc % self.total_frames]
self._loc += 1
return img
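These sources are consumed by the DMC wrapper below, which overwrites masked background pixels with `get_image()`. A small standalone sketch of that pattern, using a synthetic frame instead of a real render:

```python
import numpy as np
from dmc2gym.natural_imgsource import RandomColorSource  # assuming this module path

source = RandomColorSource((84, 84))
frame = np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8)  # stand-in for a rendered frame

# Same heuristic the wrapper uses: treat pixels where blue dominates as background.
mask = np.logical_and(frame[:, :, 2] > frame[:, :, 1], frame[:, :, 2] > frame[:, :, 0])
frame[mask] = source.get_image()[mask]

source.reset()  # called at episode boundaries to pick a new background
```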

201
Dreamer/dmc2gym/wrappers.py Normal file

@@ -0,0 +1,201 @@
from gym import core, spaces
import glob
import os
import local_dm_control_suite as suite
from dm_env import specs
import numpy as np
import skimage.io
from dmc2gym import natural_imgsource
def _spec_to_box(spec):
def extract_min_max(s):
assert s.dtype == np.float64 or s.dtype == np.float32
dim = int(np.prod(s.shape))
if type(s) == specs.Array:
bound = np.inf * np.ones(dim, dtype=np.float32)
return -bound, bound
elif type(s) == specs.BoundedArray:
zeros = np.zeros(dim, dtype=np.float32)
return s.minimum + zeros, s.maximum + zeros
mins, maxs = [], []
for s in spec:
mn, mx = extract_min_max(s)
mins.append(mn)
maxs.append(mx)
low = np.concatenate(mins, axis=0)
high = np.concatenate(maxs, axis=0)
assert low.shape == high.shape
return spaces.Box(low, high, dtype=np.float32)
def _flatten_obs(obs):
obs_pieces = []
for v in obs.values():
flat = np.array([v]) if np.isscalar(v) else v.ravel()
obs_pieces.append(flat)
return np.concatenate(obs_pieces, axis=0)
class DMCWrapper(core.Env):
def __init__(
self,
domain_name,
task_name,
resource_files,
img_source,
total_frames,
task_kwargs=None,
visualize_reward={},
from_pixels=False,
height=84,
width=84,
camera_id=0,
frame_skip=1,
environment_kwargs=None
):
assert 'random' in task_kwargs, 'please specify a seed, for deterministic behaviour'
self._from_pixels = from_pixels
self._height = height
self._width = width
self._camera_id = camera_id
self._frame_skip = frame_skip
self._img_source = img_source
self._resource_files = resource_files
# create task
self._env = suite.load(
domain_name=domain_name,
task_name=task_name,
task_kwargs=task_kwargs,
visualize_reward=visualize_reward,
environment_kwargs=environment_kwargs
)
# true and normalized action spaces
self._true_action_space = _spec_to_box([self._env.action_spec()])
self._norm_action_space = spaces.Box(
low=-1.0,
high=1.0,
shape=self._true_action_space.shape,
dtype=np.float32
)
# create observation space
if from_pixels:
self._observation_space = spaces.Box(
low=0, high=255, shape=[height, width, 3], dtype=np.uint8  # HWC, matching _get_obs
)
else:
self._observation_space = _spec_to_box(
self._env.observation_spec().values()
)
self._internal_state_space = spaces.Box(
low=-np.inf,
high=np.inf,
shape=self._env.physics.get_state().shape,
dtype=np.float32
)
# background
if img_source is not None:
shape2d = (height, width)
if img_source == "color":
self._bg_source = natural_imgsource.RandomColorSource(shape2d)
elif img_source == "noise":
self._bg_source = natural_imgsource.NoiseSource(shape2d)
else:
files = glob.glob(os.path.expanduser(resource_files))
assert len(files), "Pattern {} does not match any files".format(
resource_files
)
if img_source == "images":
self._bg_source = natural_imgsource.RandomImageSource(shape2d, files, grayscale=True, total_frames=total_frames)
elif img_source == "video":
self._bg_source = natural_imgsource.RandomVideoSource(shape2d, files, grayscale=True, total_frames=total_frames)
else:
raise Exception("img_source %s not defined." % img_source)
# set seed
self.seed(seed=task_kwargs.get('random', 1))
def __getattr__(self, name):
return getattr(self._env, name)
def _get_obs(self, time_step):
if self._from_pixels:
obs = self.render(
height=self._height,
width=self._width,
camera_id=self._camera_id
)
if self._img_source is not None:
mask = np.logical_and((obs[:, :, 2] > obs[:, :, 1]), (obs[:, :, 2] > obs[:, :, 0])) # hardcoded for dmc
bg = self._bg_source.get_image()
obs[mask] = bg[mask]
# obs = obs.transpose(2, 0, 1).copy()
# CHW to HWC for tensorflow
obs = obs.copy()
else:
obs = _flatten_obs(time_step.observation)
return obs
def _convert_action(self, action):
action = action.astype(np.float64)
true_delta = self._true_action_space.high - self._true_action_space.low
norm_delta = self._norm_action_space.high - self._norm_action_space.low
action = (action - self._norm_action_space.low) / norm_delta
action = action * true_delta + self._true_action_space.low
action = action.astype(np.float32)
return action
@property
def observation_space(self):
return self._observation_space
@property
def internal_state_space(self):
return self._internal_state_space
@property
def action_space(self):
return self._norm_action_space
def seed(self, seed):
self._true_action_space.seed(seed)
self._norm_action_space.seed(seed)
self._observation_space.seed(seed)
def step(self, action):
assert self._norm_action_space.contains(action)
action = self._convert_action(action)
assert self._true_action_space.contains(action)
reward = 0
extra = {'internal_state': self._env.physics.get_state().copy()}
for _ in range(self._frame_skip):
time_step = self._env.step(action)
reward += time_step.reward or 0
done = time_step.last()
if done:
break
obs = self._get_obs(time_step)
extra['discount'] = time_step.discount
return obs, reward, done, extra
def reset(self):
time_step = self._env.reset()
obs = self._get_obs(time_step)
return obs
def render(self, mode='rgb_array', height=None, width=None, camera_id=0):
assert mode == 'rgb_array', 'only rgb_array mode is supported, given %s' % mode
height = height or self._height
width = width or self._width
camera_id = camera_id or self._camera_id
return self._env.physics.render(
height=height, width=width, camera_id=camera_id
)
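Continuing the registration sketch above, interaction with this wrapper follows the standard gym protocol; actions are sampled in the normalized [-1, 1] space and rescaled internally:

```python
obs = env.reset()                    # (height, width, 3) uint8 pixels when from_pixels=True
action = env.action_space.sample()   # normalized action in [-1, 1]
obs, reward, done, extra = env.step(action)
print(reward, done, extra['discount'], extra['internal_state'].shape)
```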

739
Dreamer/dreamers.py Normal file

@@ -0,0 +1,739 @@
import tools
import models
from tensorflow_probability import distributions as tfd
from tensorflow.keras.mixed_precision import experimental as prec
import tensorflow as tf
import numpy as np
import collections
import functools
import json
import time
from env_tools import preprocess, count_steps
def load_dataset(directory, config):
episode = next(tools.load_episodes(directory, 1))
types = {k: v.dtype for k, v in episode.items()}
shapes = {k: (None,) + v.shape[1:] for k, v in episode.items()}
def generator(): return tools.load_episodes(
directory, config.train_steps, config.batch_length,
config.dataset_balance)
dataset = tf.data.Dataset.from_generator(generator, types, shapes)
dataset = dataset.batch(config.batch_size, drop_remainder=True)
dataset = dataset.map(functools.partial(preprocess, config=config))
dataset = dataset.prefetch(10)
return dataset
class Dreamer(tools.Module):
def __init__(self, config, datadir, actspace, writer):
self._c = config
self._actspace = actspace
self._actdim = actspace.n if hasattr(
actspace, 'n') else actspace.shape[0]
self._writer = writer
self._random = np.random.RandomState(config.seed)
self._should_pretrain = tools.Once()
self._should_train = tools.Every(config.train_every)
self._should_log = tools.Every(config.log_every)
self._last_log = None
self._last_time = time.time()
self._metrics = collections.defaultdict(tf.metrics.Mean)
self._metrics['expl_amount'] # Create variable for checkpoint.
self._float = prec.global_policy().compute_dtype
self._strategy = tf.distribute.MirroredStrategy()
with tf.device('cpu:0'):
self._step = tf.Variable(count_steps(
datadir, config), dtype=tf.int64)
with self._strategy.scope():
self._dataset = iter(self._strategy.experimental_distribute_dataset(
load_dataset(datadir, self._c)))
self._build_model()
def __call__(self, obs, reset, state=None, training=True):
step = self._step.numpy().item()
tf.summary.experimental.set_step(step)
if state is not None and reset.any():
mask = tf.cast(1 - reset, self._float)[:, None]
state = tf.nest.map_structure(lambda x: x * mask, state)
if self._should_train(step):
log = self._should_log(step)
n = self._c.pretrain if self._should_pretrain() else self._c.train_steps
print(f'Training for {n} steps.')
with self._strategy.scope():
for train_step in range(n):
log_images = self._c.log_images and log and train_step == 0
self.train(next(self._dataset), log_images)
if log:
self._write_summaries()
action, state = self.policy(obs, state, training)
if training:
self._step.assign_add(len(reset) * self._c.action_repeat)
return action, state
@tf.function
def policy(self, obs, state, training):
if state is None:
latent = self._dynamics.initial(len(obs['image']))
action = tf.zeros((len(obs['image']), self._actdim), self._float)
else:
latent, action = state
embed = self._encode(preprocess(obs, self._c))
latent, _ = self._dynamics.obs_step(latent, action, embed)
feat = self._dynamics.get_feat(latent)
if training:
action = self._actor(feat).sample()
else:
action = self._actor(feat).mode()
action = self._exploration(action, training)
state = (latent, action)
return action, state
def load(self, filename):
super().load(filename)
self._should_pretrain()
@tf.function()
def train(self, data, log_images=False):
self._strategy.experimental_run_v2(
self._train, args=(data, log_images))
def _train(self, data, log_images):
with tf.GradientTape() as model_tape:
embed = self._encode(data)
post, prior = self._dynamics.observe(embed, data['action'])
feat = self._dynamics.get_feat(post)
image_pred = self._decode(feat)
reward_pred = self._reward(feat)
likes = tools.AttrDict()
likes.image = tf.reduce_mean(image_pred.log_prob(data['image']))
likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
if self._c.pcont:
pcont_pred = self._pcont(feat)
pcont_target = self._c.discount * data['discount']
likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
likes.pcont *= self._c.pcont_scale
prior_dist = self._dynamics.get_dist(prior)
post_dist = self._dynamics.get_dist(post)
div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
div = tf.maximum(div, self._c.free_nats)
model_loss = self._c.kl_scale * div - sum(likes.values())
model_loss /= float(self._strategy.num_replicas_in_sync)
with tf.GradientTape() as actor_tape:
imag_feat = self._imagine_ahead(post)
reward = self._reward(imag_feat).mode()
if self._c.pcont:
pcont = self._pcont(imag_feat).mean()
else:
pcont = self._c.discount * tf.ones_like(reward)
value = self._value(imag_feat).mode()
returns = tools.lambda_return(
reward[:-1], value[:-1], pcont[:-1],
bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
[tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
actor_loss = -tf.reduce_mean(discount * returns)
actor_loss /= float(self._strategy.num_replicas_in_sync)
with tf.GradientTape() as value_tape:
value_pred = self._value(imag_feat)[:-1]
target = tf.stop_gradient(returns)
value_loss = - \
tf.reduce_mean(discount * value_pred.log_prob(target))
value_loss /= float(self._strategy.num_replicas_in_sync)
model_norm = self._model_opt(model_tape, model_loss)
actor_norm = self._actor_opt(actor_tape, actor_loss)
value_norm = self._value_opt(value_tape, value_loss)
if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
if self._c.log_scalars:
self._scalar_summaries(
data, feat, prior_dist, post_dist, likes, div,
model_loss, value_loss, actor_loss, model_norm, value_norm,
actor_norm)
if tf.equal(log_images, True):
self._image_summaries(data, embed, image_pred)
def _build_model(self):
acts = dict(
elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish,
leaky_relu=tf.nn.leaky_relu)
cnn_act = acts[self._c.cnn_act]
act = acts[self._c.dense_act]
self._encode = models.ConvEncoder(
self._c.cnn_depth, cnn_act, self._c.image_size)
self._dynamics = models.RSSM(
self._c.stoch_size, self._c.deter_size, self._c.deter_size)
self._decode = models.ConvDecoder(
self._c.cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
if self._c.pcont:
self._pcont = models.DenseDecoder(
(), 3, self._c.num_units, 'binary', act=act)
self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
self._actor = models.ActionDecoder(
self._actdim, 4, self._c.num_units, self._c.action_dist,
init_std=self._c.action_init_std, act=act)
model_modules = [self._encode, self._dynamics,
self._decode, self._reward]
if self._c.pcont:
model_modules.append(self._pcont)
Optimizer = functools.partial(
tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip,
wdpattern=self._c.weight_decay_pattern)
self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
self.train(next(self._dataset))
def _exploration(self, action, training):
if training:
amount = self._c.expl_amount
if self._c.expl_decay:
amount *= 0.5 ** (tf.cast(self._step,
tf.float32) / self._c.expl_decay)
if self._c.expl_min:
amount = tf.maximum(self._c.expl_min, amount)
self._metrics['expl_amount'].update_state(amount)
elif self._c.eval_noise:
amount = self._c.eval_noise
else:
return action
if self._c.expl == 'additive_gaussian':
return tf.clip_by_value(tfd.Normal(action, amount).sample(), -1, 1)
if self._c.expl == 'completely_random':
return tf.random.uniform(action.shape, -1, 1)
if self._c.expl == 'epsilon_greedy':
indices = tfd.Categorical(0 * action).sample()
# pylint: disable=unexpected-keyword-arg, no-value-for-parameter
return tf.where(
tf.random.uniform(action.shape[:1], 0, 1) < amount,
tf.one_hot(indices, action.shape[-1], dtype=self._float),
action)
raise NotImplementedError(self._c.expl)
def _imagine_ahead(self, post):
if self._c.pcont: # Last step could be terminal.
post = {k: v[:, :-1] for k, v in post.items()}
def flatten(x): return tf.reshape(x, [-1] + list(x.shape[2:]))
start = {k: flatten(v) for k, v in post.items()}
def policy(state): return self._actor(
tf.stop_gradient(self._dynamics.get_feat(state))).sample()
states = tools.static_scan(
lambda prev, _: self._dynamics.img_step(prev, policy(prev)),
tf.range(self._c.horizon), start)
imag_feat = self._dynamics.get_feat(states)
return imag_feat
def _scalar_summaries(
self, data, feat, prior_dist, post_dist, likes, div,
model_loss, value_loss, actor_loss, model_norm, value_norm,
actor_norm):
self._metrics['model_grad_norm'].update_state(model_norm)
self._metrics['value_grad_norm'].update_state(value_norm)
self._metrics['actor_grad_norm'].update_state(actor_norm)
self._metrics['prior_ent'].update_state(prior_dist.entropy())
self._metrics['post_ent'].update_state(post_dist.entropy())
for name, logprob in likes.items():
self._metrics[name + '_loss'].update_state(-logprob)
self._metrics['div'].update_state(div)
self._metrics['model_loss'].update_state(model_loss)
self._metrics['value_loss'].update_state(value_loss)
self._metrics['actor_loss'].update_state(actor_loss)
self._metrics['action_ent'].update_state(self._actor(feat).entropy())
def _image_summaries(self, data, embed, image_pred):
truth = data['image'][:6] + 0.5
recon = image_pred.mode()[:6]
init, _ = self._dynamics.observe(embed[:6, :5], data['action'][:6, :5])
init = {k: v[:, -1] for k, v in init.items()}
prior = self._dynamics.imagine(data['action'][:6, 5:], init)
openl = self._decode(self._dynamics.get_feat(prior)).mode()
model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1)
error = (model - truth + 1) / 2
openl = tf.concat([truth, model, error], 2)
tools.graph_summary(
self._writer, tools.video_summary, self._step, 'agent/openl', openl)
def image_summary_from_data(self, data):
truth = data['image'][:6] + 0.5
embed = self._encode(data)
post, _ = self._dynamics.observe(
embed[:6, :5], data['action'][:6, :5])
feat = self._dynamics.get_feat(post)
init = {k: v[:, -1] for k, v in post.items()}
recon = self._decode(feat).mode()[:6]
prior = self._dynamics.imagine(data['action'][:6, 5:], init)
openl = self._decode(self._dynamics.get_feat(prior)).mode()
model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1)
error = (model - truth + 1) / 2
openl = tf.concat([truth, model, error], 2)
tools.graph_summary(
self._writer, tools.video_summary, self._step, 'agent/eval_openl', openl)
def _write_summaries(self):
step = int(self._step.numpy())
metrics = [(k, float(v.result())) for k, v in self._metrics.items()]
if self._last_log is not None:
duration = time.time() - self._last_time
self._last_time += duration
metrics.append(('fps', (step - self._last_log) / duration))
self._last_log = step
[m.reset_states() for m in self._metrics.values()]
with (self._c.logdir / 'metrics.jsonl').open('a') as f:
f.write(json.dumps({'step': step, **dict(metrics)}) + '\n')
[tf.summary.scalar('agent/' + k, m) for k, m in metrics]
print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics))
self._writer.flush()
class SeparationDreamer(Dreamer):
def __init__(self, config, datadir, actspace, writer):
self._metrics_disen = collections.defaultdict(tf.metrics.Mean)
self._metrics_disen['expl_amount']
super().__init__(config, datadir, actspace, writer)
def _train(self, data, log_images):
with tf.GradientTape(persistent=True) as model_tape:
# main
embed = self._encode(data)
post, prior = self._dynamics.observe(embed, data['action'])
feat = self._dynamics.get_feat(post)
# disen
embed_disen = self._disen_encode(data)
post_disen, prior_disen = self._disen_dynamics.observe(
embed_disen, data['action'])
feat_disen = self._disen_dynamics.get_feat(post_disen)
# disen image pred
image_pred_disen = self._disen_only_decode(feat_disen)
# joint image pred
image_pred_joint, image_pred_joint_main, image_pred_joint_disen, mask_pred = self._joint_decode(
feat, feat_disen)
# reward pred
reward_pred = self._reward(feat)
# optimize disen reward predictor till optimal
for _ in range(self._c.num_reward_opt_iters):
with tf.GradientTape() as disen_reward_tape:
reward_pred_disen = self._disen_reward(
tf.stop_gradient(feat_disen))
reward_like_disen = reward_pred_disen.log_prob(
data['reward'])
reward_loss_disen = -tf.reduce_mean(reward_like_disen)
reward_loss_disen /= float(
self._strategy.num_replicas_in_sync)
reward_disen_norm = self._disen_reward_opt(
disen_reward_tape, reward_loss_disen)
# disen reward pred with optimal reward predictor
reward_pred_disen = self._disen_reward(feat_disen)
reward_like_disen = tf.reduce_mean(
reward_pred_disen.log_prob(data['reward']))
# main model loss
likes = tools.AttrDict()
likes.image = tf.reduce_mean(
image_pred_joint.log_prob(data['image']))
likes.reward = tf.reduce_mean(reward_pred.log_prob(
data['reward'])) * self._c.reward_scale
if self._c.pcont:
pcont_pred = self._pcont(feat)
pcont_target = self._c.discount * data['discount']
likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
likes.pcont *= self._c.pcont_scale
prior_dist = self._dynamics.get_dist(prior)
post_dist = self._dynamics.get_dist(post)
div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
div = tf.maximum(div, self._c.free_nats)
model_loss = self._c.kl_scale * div - sum(likes.values())
model_loss /= float(self._strategy.num_replicas_in_sync)
# disen model loss with reward negative gradient
likes_disen = tools.AttrDict()
likes_disen.image = tf.reduce_mean(
image_pred_joint.log_prob(data['image']))
likes_disen.disen_only = tf.reduce_mean(
image_pred_disen.log_prob(data['image']))
reward_like_disen = reward_pred_disen.log_prob(data['reward'])
reward_like_disen = tf.reduce_mean(reward_like_disen)
reward_loss_disen = -reward_like_disen
prior_dist_disen = self._disen_dynamics.get_dist(prior_disen)
post_dist_disen = self._disen_dynamics.get_dist(post_disen)
div_disen = tf.reduce_mean(tfd.kl_divergence(
post_dist_disen, prior_dist_disen))
div_disen = tf.maximum(div_disen, self._c.free_nats)
model_loss_disen = div_disen * self._c.disen_kl_scale + \
reward_like_disen * self._c.disen_neg_rew_scale - \
likes_disen.image - likes_disen.disen_only * self._c.disen_rec_scale
model_loss_disen /= float(self._strategy.num_replicas_in_sync)
decode_loss = model_loss_disen + model_loss
with tf.GradientTape() as actor_tape:
imag_feat = self._imagine_ahead(post)
reward = self._reward(imag_feat).mode()
if self._c.pcont:
pcont = self._pcont(imag_feat).mean()
else:
pcont = self._c.discount * tf.ones_like(reward)
value = self._value(imag_feat).mode()
returns = tools.lambda_return(
reward[:-1], value[:-1], pcont[:-1],
bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
[tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
actor_loss = -tf.reduce_mean(discount * returns)
actor_loss /= float(self._strategy.num_replicas_in_sync)
with tf.GradientTape() as value_tape:
value_pred = self._value(imag_feat)[:-1]
target = tf.stop_gradient(returns)
value_loss = - \
tf.reduce_mean(discount * value_pred.log_prob(target))
value_loss /= float(self._strategy.num_replicas_in_sync)
model_norm = self._model_opt(model_tape, model_loss)
model_disen_norm = self._disen_opt(model_tape, model_loss_disen)
decode_norm = self._decode_opt(model_tape, decode_loss)
actor_norm = self._actor_opt(actor_tape, actor_loss)
value_norm = self._value_opt(value_tape, value_loss)
if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
if self._c.log_scalars:
self._scalar_summaries(
data, feat, prior_dist, post_dist, likes, div,
model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm)
self._scalar_summaries_disen(
prior_dist_disen, post_dist_disen, likes_disen, div_disen,
model_loss_disen, reward_loss_disen,
model_disen_norm, reward_disen_norm)
if tf.equal(log_images, True):
self._image_summaries_joint(
data, embed, embed_disen, image_pred_joint, mask_pred)
self._image_summaries(
self._disen_dynamics, self._disen_decode, data, embed_disen, image_pred_joint_disen, tag='disen/openl_joint_disen')
self._image_summaries(
self._disen_dynamics, self._disen_only_decode, data, embed_disen, image_pred_disen, tag='disen_only/openl_disen_only')
self._image_summaries(
self._dynamics, self._main_decode, data, embed, image_pred_joint_main, tag='main/openl_joint_main')
def _build_model(self):
acts = dict(
elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish,
leaky_relu=tf.nn.leaky_relu)
cnn_act = acts[self._c.cnn_act]
act = acts[self._c.dense_act]
# Distractor dynamic model
self._disen_encode = models.ConvEncoder(
self._c.disen_cnn_depth, cnn_act, self._c.image_size)
self._disen_dynamics = models.RSSM(
self._c.disen_stoch_size, self._c.disen_deter_size, self._c.disen_deter_size)
self._disen_only_decode = models.ConvDecoder(
self._c.disen_cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
self._disen_reward = models.DenseDecoder(
(), 2, self._c.num_units, act=act)
# Task dynamic model
self._encode = models.ConvEncoder(
self._c.cnn_depth, cnn_act, self._c.image_size)
self._dynamics = models.RSSM(
self._c.stoch_size, self._c.deter_size, self._c.deter_size)
self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
if self._c.pcont:
self._pcont = models.DenseDecoder(
(), 3, self._c.num_units, 'binary', act=act)
self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
self._actor = models.ActionDecoder(
self._actdim, 4, self._c.num_units, self._c.action_dist,
init_std=self._c.action_init_std, act=act)
# Joint decode
self._main_decode = models.ConvDecoderMask(
self._c.cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
self._disen_decode = models.ConvDecoderMask(
self._c.disen_cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
self._joint_decode = models.ConvDecoderMaskEnsemble(
self._main_decode, self._disen_decode, self._c.precision
)
disen_modules = [self._disen_encode,
self._disen_dynamics, self._disen_only_decode]
model_modules = [self._encode, self._dynamics, self._reward]
if self._c.pcont:
model_modules.append(self._pcont)
Optimizer = functools.partial(
tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip,
wdpattern=self._c.weight_decay_pattern)
self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
self._disen_opt = Optimizer('disen', disen_modules, self._c.model_lr)
self._decode_opt = Optimizer(
'decode', [self._joint_decode], self._c.model_lr)
self._disen_reward_opt = Optimizer(
'disen_reward', [self._disen_reward], self._c.disen_reward_lr)
self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
self.train(next(self._dataset))
def _scalar_summaries_disen(
self, prior_dist_disen, post_dist_disen, likes_disen, div_disen,
model_loss_disen, reward_loss_disen,
model_disen_norm, reward_disen_norm):
self._metrics_disen['model_grad_norm'].update_state(model_disen_norm)
self._metrics_disen['reward_grad_norm'].update_state(reward_disen_norm)
self._metrics_disen['prior_ent'].update_state(
prior_dist_disen.entropy())
self._metrics_disen['post_ent'].update_state(post_dist_disen.entropy())
for name, logprob in likes_disen.items():
self._metrics_disen[name + '_loss'].update_state(-logprob)
self._metrics_disen['div'].update_state(div_disen)
self._metrics_disen['model_loss'].update_state(model_loss_disen)
self._metrics_disen['reward_loss'].update_state(
reward_loss_disen)
def _image_summaries(self, dynamics, decoder, data, embed, image_pred, tag='agent/openl'):
truth = data['image'][:6] + 0.5
recon = image_pred.mode()[:6]
init, _ = dynamics.observe(embed[:6, :5], data['action'][:6, :5])
init = {k: v[:, -1] for k, v in init.items()}
prior = dynamics.imagine(data['action'][:6, 5:], init)
if isinstance(decoder, models.ConvDecoderMask):
openl, _ = decoder(dynamics.get_feat(prior))
openl = openl.mode()
else:
openl = decoder(dynamics.get_feat(prior)).mode()
model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1)
error = (model - truth + 1) / 2
openl = tf.concat([truth, model, error], 2)
tools.graph_summary(
self._writer, tools.video_summary, self._step, tag, openl)
def _image_summaries_joint(self, data, embed, embed_disen, image_pred_joint, mask_pred):
truth = data['image'][:6] + 0.5
recon_joint = image_pred_joint.mode()[:6]
mask_pred = mask_pred[:6]
init, _ = self._dynamics.observe(
embed[:6, :5], data['action'][:6, :5])
init_disen, _ = self._disen_dynamics.observe(
embed_disen[:6, :5], data['action'][:6, :5])
init = {k: v[:, -1] for k, v in init.items()}
init_disen = {k: v[:, -1] for k, v in init_disen.items()}
prior = self._dynamics.imagine(
data['action'][:6, 5:], init)
prior_disen = self._disen_dynamics.imagine(
data['action'][:6, 5:], init_disen)
feat = self._dynamics.get_feat(prior)
feat_disen = self._disen_dynamics.get_feat(prior_disen)
openl, _, _, openl_mask = self._joint_decode(feat, feat_disen)
openl = openl.mode()
model = tf.concat([recon_joint[:, :5] + 0.5, openl + 0.5], 1)
error = (model - truth + 1) / 2
openl = tf.concat([truth, model, error], 2)
openl_mask = tf.concat([mask_pred[:, :5] + 0.5, openl_mask + 0.5], 1)
tools.graph_summary(
self._writer, tools.video_summary, self._step, 'joint/openl_joint', openl)
tools.graph_summary(
self._writer, tools.video_summary, self._step, 'mask/openl_mask', openl_mask)
def image_summary_from_data(self, data):
truth = data['image'][:6] + 0.5
# main
embed = self._encode(data)
post, _ = self._dynamics.observe(
embed[:6, :5], data['action'][:6, :5])
feat = self._dynamics.get_feat(post)
init = {k: v[:, -1] for k, v in post.items()}
# disen
embed_disen = self._disen_encode(data)
post_disen, _ = self._disen_dynamics.observe(
embed_disen[:6, :5], data['action'][:6, :5])
feat_disen = self._disen_dynamics.get_feat(post_disen)
init_disen = {k: v[:, -1] for k, v in post_disen.items()}
# joint image pred
recon_joint, recon_main, recon_disen, recon_mask = self._joint_decode(
feat, feat_disen)
recon_joint = recon_joint.mode()[:6]
recon_main = recon_main.mode()[:6]
recon_disen = recon_disen.mode()[:6]
recon_mask = recon_mask[:6]
prior = self._dynamics.imagine(
data['action'][:6, 5:], init)
prior_disen = self._disen_dynamics.imagine(
data['action'][:6, 5:], init_disen)
feat = self._dynamics.get_feat(prior)
feat_disen = self._disen_dynamics.get_feat(prior_disen)
openl_joint, openl_main, openl_disen, openl_mask = self._joint_decode(
feat, feat_disen)
openl_joint = openl_joint.mode()
openl_main = openl_main.mode()
openl_disen = openl_disen.mode()
model_joint = tf.concat(
[recon_joint[:, :5] + 0.5, openl_joint + 0.5], 1)
error_joint = (model_joint - truth + 1) / 2
model_main = tf.concat(
[recon_main[:, :5] + 0.5, openl_main + 0.5], 1)
model_disen = tf.concat(
[recon_disen[:, :5] + 0.5, openl_disen + 0.5], 1)
model_mask = tf.concat(
[recon_mask[:, :5] + 0.5, openl_mask + 0.5], 1)
output_joint = tf.concat(
[truth, model_main, model_disen, model_joint, error_joint], 2)
output_mask = model_mask
tools.graph_summary(
self._writer, tools.video_summary, self._step, 'summary/openl', output_joint)
tools.graph_summary(
self._writer, tools.video_summary, self._step, 'summary/openl_mask', output_mask)
def _write_summaries(self):
step = int(self._step.numpy())
metrics = [(k, float(v.result())) for k, v in self._metrics.items()]
metrics_disen = [(k, float(v.result()))
for k, v in self._metrics_disen.items()]
if self._last_log is not None:
duration = time.time() - self._last_time
self._last_time += duration
metrics.append(('fps', (step - self._last_log) / duration))
self._last_log = step
[m.reset_states() for m in self._metrics.values()]
[m.reset_states() for m in self._metrics_disen.values()]
with (self._c.logdir / 'metrics.jsonl').open('a') as f:
f.write(json.dumps({'step': step, **dict(metrics)}) + '\n')
[tf.summary.scalar('agent/' + k, m) for k, m in metrics]
[tf.summary.scalar('disen/' + k, m) for k, m in metrics_disen]
print('#'*30 + ' Main ' + '#'*30)
print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics))
print('#'*30 + ' Disen ' + '#'*30)
print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics_disen))
self._writer.flush()
class InverseDreamer(Dreamer):
def __init__(self, config, datadir, actspace, writer):
super().__init__(config, datadir, actspace, writer)
def _train(self, data, log_images):
with tf.GradientTape() as model_tape:
embed = self._encode(data)
post, prior = self._dynamics.observe(embed, data['action'])
feat = self._dynamics.get_feat(post)
action_pred = self._decode(feat)
reward_pred = self._reward(feat)
likes = tools.AttrDict()
likes.action = tf.reduce_mean(
action_pred.log_prob(data['action'][:, :-1]))
likes.reward = tf.reduce_mean(
reward_pred.log_prob(data['reward']))
if self._c.pcont:
pcont_pred = self._pcont(feat)
pcont_target = self._c.discount * data['discount']
likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
likes.pcont *= self._c.pcont_scale
prior_dist = self._dynamics.get_dist(prior)
post_dist = self._dynamics.get_dist(post)
div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
div = tf.maximum(div, self._c.free_nats)
model_loss = self._c.kl_scale * div - sum(likes.values())
model_loss /= float(self._strategy.num_replicas_in_sync)
with tf.GradientTape() as actor_tape:
imag_feat = self._imagine_ahead(post)
reward = self._reward(imag_feat).mode()
if self._c.pcont:
pcont = self._pcont(imag_feat).mean()
else:
pcont = self._c.discount * tf.ones_like(reward)
value = self._value(imag_feat).mode()
returns = tools.lambda_return(
reward[:-1], value[:-1], pcont[:-1],
bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
[tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
actor_loss = -tf.reduce_mean(discount * returns)
actor_loss /= float(self._strategy.num_replicas_in_sync)
with tf.GradientTape() as value_tape:
value_pred = self._value(imag_feat)[:-1]
target = tf.stop_gradient(returns)
value_loss = - \
tf.reduce_mean(discount * value_pred.log_prob(target))
value_loss /= float(self._strategy.num_replicas_in_sync)
model_norm = self._model_opt(model_tape, model_loss)
actor_norm = self._actor_opt(actor_tape, actor_loss)
value_norm = self._value_opt(value_tape, value_loss)
if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
if self._c.log_scalars:
self._scalar_summaries(
data, feat, prior_dist, post_dist, likes, div,
model_loss, value_loss, actor_loss, model_norm, value_norm,
actor_norm)
def _build_model(self):
acts = dict(
elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish,
leaky_relu=tf.nn.leaky_relu)
cnn_act = acts[self._c.cnn_act]
act = acts[self._c.dense_act]
self._encode = models.ConvEncoder(
self._c.cnn_depth, cnn_act, self._c.image_size)
self._dynamics = models.RSSM(
self._c.stoch_size, self._c.deter_size, self._c.deter_size)
self._decode = models.InverseDecoder(
self._actdim, 4, self._c.num_units, act=act)
self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
if self._c.pcont:
self._pcont = models.DenseDecoder(
(), 3, self._c.num_units, 'binary', act=act)
self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
self._actor = models.ActionDecoder(
self._actdim, 4, self._c.num_units, self._c.action_dist,
init_std=self._c.action_init_std, act=act)
model_modules = [self._encode, self._dynamics,
self._decode, self._reward]
if self._c.pcont:
model_modules.append(self._pcont)
Optimizer = functools.partial(
tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip,
wdpattern=self._c.weight_decay_pattern)
self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
self.train(next(self._dataset))
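All three agent classes share the same callable interface (`action, state = agent(obs, reset, state)`), which a training loop drives roughly as follows. This is a schematic sketch only: `config`, `datadir`, `actspace`, `writer`, and the batched `envs` object are assumed to come from the training script, not from this file:

```python
import numpy as np

agent = Dreamer(config, datadir, actspace, writer)   # placeholders set up elsewhere

obs = envs.reset()                                   # dict holding an 'image' batch
state = None
reset = np.ones(len(obs['image']), dtype=bool)       # episode-start flags
while True:
    action, state = agent(obs, reset, state, training=True)
    obs, reward, reset = envs.step(action)           # placeholder batched-env API
```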

84
Dreamer/env_tools.py Normal file

@@ -0,0 +1,84 @@
import os
import json
import dmc2gym
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as prec
import tools
import wrappers
def preprocess(obs, config):
dtype = prec.global_policy().compute_dtype
obs = obs.copy()
with tf.device('cpu:0'):
obs['image'] = tf.cast(obs['image'], dtype) / 255.0 - 0.5
clip_rewards = dict(none=lambda x: x, tanh=tf.tanh)[
config.clip_rewards]
obs['reward'] = clip_rewards(obs['reward'])
return obs
def count_steps(datadir, config):
return tools.count_episodes(datadir)[1] * config.action_repeat
def summarize_episode(episode, config, datadir, writer, prefix):
episodes, steps = tools.count_episodes(datadir)
length = (len(episode['reward']) - 1) * config.action_repeat
ret = episode['reward'].sum()
print(f'{prefix.title()} episode of length {length} with return {ret:.1f}.')
metrics = [
(f'{prefix}/return', float(episode['reward'].sum())),
(f'{prefix}/length', len(episode['reward']) - 1),
('episodes', episodes)]
step = count_steps(datadir, config)
with (config.logdir / 'metrics.jsonl').open('a') as f:
f.write(json.dumps(dict([('step', step)] + metrics)) + '\n')
with writer.as_default(): # Env might run in a different thread.
tf.summary.experimental.set_step(step)
[tf.summary.scalar('sim/' + k, v) for k, v in metrics]
if prefix == 'test':
tools.video_summary(f'sim/{prefix}/video', episode['image'][None])
def make_env(config, writer, prefix, datadir, video_dir, store):
suite, domain_task_distractor = config.task.split('_', 1)
domain, task_distractor = domain_task_distractor.split('_', 1)
task, distractor = task_distractor.split('_', 1)
if distractor == 'driving':
img_source = 'video'
total_frames = 1000
resource_files = os.path.join(video_dir, '*.mp4')
elif distractor == 'noise':
img_source = 'noise'
total_frames = None
resource_files = None
elif distractor == 'none':
img_source = None
total_frames = None
resource_files = None
else:
raise NotImplementedError
env = dmc2gym.make(
domain_name=domain,
task_name=task,
resource_files=resource_files,
img_source=img_source,
total_frames=total_frames,
seed=config.seed,
visualize_reward=False,
from_pixels=True,
height=config.image_size,
width=config.image_size,
frame_skip=config.action_repeat
)
env = wrappers.DMC2GYMWrapper(env)
env = wrappers.TimeLimit(env, config.time_limit / config.action_repeat)
callbacks = []
if store:
callbacks.append(lambda ep: tools.save_episodes(datadir, [ep]))
callbacks.append(
lambda ep: summarize_episode(ep, config, datadir, writer, prefix))
env = wrappers.Collect(env, callbacks, config.precision)
env = wrappers.RewardObs(env)
return env
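The task string is parsed by three successive underscore splits, so a value such as `dmc_cheetah_run_driving` selects the dmc suite, the cheetah domain, the run task, and the driving-video distractor (`noise` and `none` are the other supported distractors). A tiny sketch of the same parsing, with an illustrative value:

```python
task = 'dmc_cheetah_run_driving'             # illustrative value for config.task
suite_name, rest = task.split('_', 1)        # 'dmc', 'cheetah_run_driving'
domain, rest = rest.split('_', 1)            # 'cheetah', 'run_driving'
task_name, distractor = rest.split('_', 1)   # 'run', 'driving'
```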

56
Dreamer/local_dm_control_suite/README.md Normal file

@@ -0,0 +1,56 @@
# DeepMind Control Suite.
This submodule contains the domains and tasks described in the
[DeepMind Control Suite tech report](https://arxiv.org/abs/1801.00690).
## Quickstart
```python
from dm_control import suite
import numpy as np
# Load one task:
env = suite.load(domain_name="cartpole", task_name="swingup")
# Iterate over a task set:
for domain_name, task_name in suite.BENCHMARKING:
env = suite.load(domain_name, task_name)
# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()
while not time_step.last():
action = np.random.uniform(action_spec.minimum,
action_spec.maximum,
size=action_spec.shape)
time_step = env.step(action)
print(time_step.reward, time_step.discount, time_step.observation)
```
## Illustration video
Below is a video montage of solved Control Suite tasks, with reward
visualisation enabled.
[![Video montage](https://img.youtube.com/vi/rAai4QzcYbs/0.jpg)](https://www.youtube.com/watch?v=rAai4QzcYbs)
### Quadruped domain [April 2019]
Roughly based on the 'ant' model introduced by [Schulman et al. 2015](https://arxiv.org/abs/1506.02438). Main modifications to the body are:
- 4 DoFs per leg, 1 constraining tendon.
- 3 actuators per leg: 'yaw', 'lift', 'extend'.
- Filtered position actuators with timescale of 100ms.
- Sensors include an IMU, force/torque sensors, and rangefinders.
Four tasks:
- `walk` and `run`: self-right the body then move forward at a desired speed.
- `escape`: escape a bowl-shaped random terrain (uses rangefinders).
- `fetch`: go to a moving ball and bring it to a target.
All behaviors in the video below were trained with [Abdolmaleki et al.'s
MPO](https://arxiv.org/abs/1806.06920).
[![Video montage](https://img.youtube.com/vi/RhRLjbb7pBE/0.jpg)](https://www.youtube.com/watch?v=RhRLjbb7pBE)
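Assuming the four quadruped tasks listed above are registered in this copy of the suite, they load exactly like the quickstart example:

```python
from dm_control import suite  # or the vendored copy: import local_dm_control_suite as suite

for task_name in ("walk", "run", "escape", "fetch"):
    env = suite.load(domain_name="quadruped", task_name=task_name)
    print(task_name, env.action_spec().shape)
```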

151
Dreamer/local_dm_control_suite/__init__.py Normal file

@@ -0,0 +1,151 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""A collection of MuJoCo-based Reinforcement Learning environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import inspect
import itertools
from dm_control.rl import control
from local_dm_control_suite import acrobot
from local_dm_control_suite import ball_in_cup
from local_dm_control_suite import cartpole
from local_dm_control_suite import cheetah
from local_dm_control_suite import finger
from local_dm_control_suite import fish
from local_dm_control_suite import hopper
from local_dm_control_suite import humanoid
from local_dm_control_suite import humanoid_CMU
from local_dm_control_suite import lqr
from local_dm_control_suite import manipulator
from local_dm_control_suite import pendulum
from local_dm_control_suite import point_mass
from local_dm_control_suite import quadruped
from local_dm_control_suite import reacher
from local_dm_control_suite import stacker
from local_dm_control_suite import swimmer
from local_dm_control_suite import walker
# Find all domains imported.
_DOMAINS = {name: module for name, module in locals().items()
if inspect.ismodule(module) and hasattr(module, 'SUITE')}
def _get_tasks(tag):
"""Returns a sequence of (domain name, task name) pairs for the given tag."""
result = []
for domain_name in sorted(_DOMAINS.keys()):
domain = _DOMAINS[domain_name]
if tag is None:
tasks_in_domain = domain.SUITE
else:
tasks_in_domain = domain.SUITE.tagged(tag)
for task_name in tasks_in_domain.keys():
result.append((domain_name, task_name))
return tuple(result)
def _get_tasks_by_domain(tasks):
"""Returns a dict mapping from task name to a tuple of domain names."""
result = collections.defaultdict(list)
for domain_name, task_name in tasks:
result[domain_name].append(task_name)
return {k: tuple(v) for k, v in result.items()}
# A sequence containing all (domain name, task name) pairs.
ALL_TASKS = _get_tasks(tag=None)
# Subsets of ALL_TASKS, generated via the tag mechanism.
BENCHMARKING = _get_tasks('benchmarking')
EASY = _get_tasks('easy')
HARD = _get_tasks('hard')
EXTRA = tuple(sorted(set(ALL_TASKS) - set(BENCHMARKING)))
# A mapping from each domain name to a sequence of its task names.
TASKS_BY_DOMAIN = _get_tasks_by_domain(ALL_TASKS)
def load(domain_name, task_name, task_kwargs=None, environment_kwargs=None,
visualize_reward=False):
"""Returns an environment from a domain name, task name and optional settings.
```python
env = suite.load('cartpole', 'balance')
```
Args:
domain_name: A string containing the name of a domain.
task_name: A string containing the name of a task.
task_kwargs: Optional `dict` of keyword arguments for the task.
environment_kwargs: Optional `dict` specifying keyword arguments for the
environment.
visualize_reward: Optional `bool`. If `True`, object colours in rendered
frames are set to indicate the reward at each step. Default `False`.
Returns:
The requested environment.
"""
return build_environment(domain_name, task_name, task_kwargs,
environment_kwargs, visualize_reward)
def build_environment(domain_name, task_name, task_kwargs=None,
environment_kwargs=None, visualize_reward=False):
"""Returns an environment from the suite given a domain name and a task name.
Args:
domain_name: A string containing the name of a domain.
task_name: A string containing the name of a task.
task_kwargs: Optional `dict` specifying keyword arguments for the task.
environment_kwargs: Optional `dict` specifying keyword arguments for the
environment.
visualize_reward: Optional `bool`. If `True`, object colours in rendered
frames are set to indicate the reward at each step. Default `False`.
Raises:
ValueError: If the domain or task doesn't exist.
Returns:
An instance of the requested environment.
"""
if domain_name not in _DOMAINS:
raise ValueError('Domain {!r} does not exist.'.format(domain_name))
domain = _DOMAINS[domain_name]
if task_name not in domain.SUITE:
raise ValueError('Level {!r} does not exist in domain {!r}.'.format(
task_name, domain_name))
task_kwargs = task_kwargs or {}
if environment_kwargs is not None:
task_kwargs = task_kwargs.copy()
task_kwargs['environment_kwargs'] = environment_kwargs
env = domain.SUITE[task_name](**task_kwargs)
env.task.visualize_reward = visualize_reward
return env
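A short sketch of the module-level registry this file builds (the printed contents depend on which domains are present in the local copy):

```python
import local_dm_control_suite as suite  # assuming the package is on the path

print(len(suite.ALL_TASKS), len(suite.BENCHMARKING))   # all vs. benchmarking (domain, task) pairs
print(suite.TASKS_BY_DOMAIN['acrobot'])                 # e.g. ('swingup', 'swingup_sparse')

# Seeded load; `random` is forwarded to the task constructor via task_kwargs.
env = suite.load('cartpole', 'balance', task_kwargs={'random': 0})
```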

127
Dreamer/local_dm_control_suite/acrobot.py Normal file

@@ -0,0 +1,127 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Acrobot domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
_DEFAULT_TIME_LIMIT = 10
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('acrobot.xml'), common.ASSETS
@SUITE.add('benchmarking')
def swingup(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns Acrobot balance task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Balance(sparse=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add('benchmarking')
def swingup_sparse(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns Acrobot sparse balance."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Balance(sparse=True, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Acrobot domain."""
def horizontal(self):
"""Returns horizontal (x) component of body frame z-axes."""
return self.named.data.xmat[['upper_arm', 'lower_arm'], 'xz']
def vertical(self):
"""Returns vertical (z) component of body frame z-axes."""
return self.named.data.xmat[['upper_arm', 'lower_arm'], 'zz']
def to_target(self):
"""Returns the distance from the tip to the target."""
tip_to_target = (self.named.data.site_xpos['target'] -
self.named.data.site_xpos['tip'])
return np.linalg.norm(tip_to_target)
def orientations(self):
"""Returns the sines and cosines of the pole angles."""
return np.concatenate((self.horizontal(), self.vertical()))
class Balance(base.Task):
"""An Acrobot `Task` to swing up and balance the pole."""
def __init__(self, sparse, random=None):
"""Initializes an instance of `Balance`.
Args:
sparse: A `bool` specifying whether to use a sparse (indicator) reward.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._sparse = sparse
super(Balance, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Shoulder and elbow are set to a random position between [-pi, pi).
Args:
physics: An instance of `Physics`.
"""
physics.named.data.qpos[
['shoulder', 'elbow']] = self.random.uniform(-np.pi, np.pi, 2)
super(Balance, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of pole orientation and angular velocities."""
obs = collections.OrderedDict()
obs['orientations'] = physics.orientations()
obs['velocity'] = physics.velocity()
return obs
def _get_reward(self, physics, sparse):
target_radius = physics.named.model.site_size['target', 0]
return rewards.tolerance(physics.to_target(),
bounds=(0, target_radius),
margin=0 if sparse else 1)
def get_reward(self, physics):
"""Returns a sparse or a smooth reward, as specified in the constructor."""
return self._get_reward(physics, sparse=self._sparse)
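A quick interaction sketch for this domain (assuming the local suite package is importable); the observation dict carries the `orientations` and `velocity` entries defined above:

```python
import numpy as np
import local_dm_control_suite as suite

env = suite.load('acrobot', 'swingup')
time_step = env.reset()
print(sorted(time_step.observation.keys()))   # ['orientations', 'velocity']

spec = env.action_spec()
action = np.random.uniform(spec.minimum, spec.maximum, size=spec.shape)
time_step = env.step(action)
print(time_step.reward)                        # tolerance-shaped reward in [0, 1]
```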

43
Dreamer/local_dm_control_suite/acrobot.xml Normal file

@@ -0,0 +1,43 @@
<!--
Based on Coulomb's [1] rather than Spong's [2] model.
[1] Coulom, Rémi. Reinforcement learning using neural networks, with applications to motor control.
Diss. Institut National Polytechnique de Grenoble-INPG, 2002.
[2] Spong, Mark W. "The swing up control problem for the acrobot."
IEEE control systems 15, no. 1 (1995): 49-55.
-->
<mujoco model="acrobot">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<default>
<joint damping=".05"/>
<geom type="capsule" mass="1"/>
</default>
<option timestep="0.01" integrator="RK4">
<flag constraint="disable" energy="enable"/>
</option>
<worldbody>
<light name="light" pos="0 0 6"/>
<geom name="floor" size="3 3 .2" type="plane" material="grid"/>
<site name="target" type="sphere" pos="0 0 4" size="0.2" material="target" group="3"/>
<camera name="fixed" pos="0 -6 2" zaxis="0 -1 0"/>
<camera name="lookat" mode="targetbodycom" target="upper_arm" pos="0 -2 3"/>
<body name="upper_arm" pos="0 0 2">
<joint name="shoulder" type="hinge" axis="0 1 0"/>
<geom name="upper_arm_decoration" material="decoration" type="cylinder" fromto="0 -.06 0 0 .06 0" size="0.051" mass="0"/>
<geom name="upper_arm" fromto="0 0 0 0 0 1" size="0.05" material="self"/>
<body name="lower_arm" pos="0 0 1">
<joint name="elbow" type="hinge" axis="0 1 0"/>
<geom name="lower_arm" fromto="0 0 0 0 0 1" size="0.049" material="self"/>
<site name="tip" pos="0 0 1" size="0.01"/>
</body>
</body>
</worldbody>
<actuator>
<motor name="elbow" joint="elbow" gear="2" ctrllimited="true" ctrlrange="-1 1"/>
</actuator>
</mujoco>

100
Dreamer/local_dm_control_suite/ball_in_cup.py Normal file

@@ -0,0 +1,100 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Ball-in-Cup Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
_DEFAULT_TIME_LIMIT = 20 # (seconds)
_CONTROL_TIMESTEP = .02 # (seconds)
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('ball_in_cup.xml'), common.ASSETS
@SUITE.add('benchmarking', 'easy')
def catch(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Ball-in-Cup task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = BallInCup(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics with additional features for the Ball-in-Cup domain."""
def ball_to_target(self):
"""Returns the vector from the ball to the target."""
target = self.named.data.site_xpos['target', ['x', 'z']]
ball = self.named.data.xpos['ball', ['x', 'z']]
return target - ball
def in_target(self):
"""Returns 1 if the ball is in the target, 0 otherwise."""
ball_to_target = abs(self.ball_to_target())
target_size = self.named.model.site_size['target', [0, 2]]
ball_size = self.named.model.geom_size['ball', 0]
return float(all(ball_to_target < target_size - ball_size))
class BallInCup(base.Task):
"""The Ball-in-Cup task. Put the ball in the cup."""
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Args:
physics: An instance of `Physics`.
"""
# Find a collision-free random initial position of the ball.
penetrating = True
while penetrating:
# Assign a random ball position.
physics.named.data.qpos['ball_x'] = self.random.uniform(-.2, .2)
physics.named.data.qpos['ball_z'] = self.random.uniform(.2, .5)
# Check for collisions.
physics.after_reset()
penetrating = physics.data.ncon > 0
super(BallInCup, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of the state."""
obs = collections.OrderedDict()
obs['position'] = physics.position()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a sparse reward."""
return physics.in_target()
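
A minimal usage sketch for the task factory above, assuming this repository's local_dm_control_suite package is importable; the step loop and random actions are illustrative only:

import numpy as np
from local_dm_control_suite import ball_in_cup

env = ball_in_cup.catch()
spec = env.action_spec()
time_step = env.reset()
for _ in range(20):
    # Sample a random action within the actuator bounds and advance the physics.
    action = np.random.uniform(spec.minimum, spec.maximum, size=spec.shape)
    time_step = env.step(action)
    print(time_step.reward)  # sparse: 1.0 only while the ball is inside the cup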

View File

@ -0,0 +1,54 @@
<mujoco model="ball in cup">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<default>
<motor ctrllimited="true" ctrlrange="-1 1" gear="5"/>
<default class="cup">
<joint type="slide" damping="3" stiffness="20"/>
<geom type="capsule" size=".008" material="self"/>
</default>
</default>
<worldbody>
<light name="light" directional="true" diffuse=".6 .6 .6" pos="0 0 2" specular=".3 .3 .3"/>
<geom name="ground" type="plane" pos="0 0 0" size=".6 .2 10" material="grid"/>
<camera name="cam0" pos="0 -1 .8" xyaxes="1 0 0 0 1 2"/>
<camera name="cam1" pos="0 -1 .4" xyaxes="1 0 0 0 0 1" />
<body name="cup" pos="0 0 .6" childclass="cup">
<joint name="cup_x" axis="1 0 0"/>
<joint name="cup_z" axis="0 0 1"/>
<geom name="cup_part_0" fromto="-.05 0 0 -.05 0 -.075" />
<geom name="cup_part_1" fromto="-.05 0 -.075 -.025 0 -.1" />
<geom name="cup_part_2" fromto="-.025 0 -.1 .025 0 -.1" />
<geom name="cup_part_3" fromto=".025 0 -.1 .05 0 -.075" />
<geom name="cup_part_4" fromto=".05 0 -.075 .05 0 0" />
<site name="cup" pos="0 0 -.108" size=".005"/>
<site name="target" type="box" pos="0 0 -.05" size=".05 .006 .05" group="4"/>
</body>
<body name="ball" pos="0 0 .2">
<joint name="ball_x" type="slide" axis="1 0 0"/>
<joint name="ball_z" type="slide" axis="0 0 1"/>
<geom name="ball" type="sphere" size=".025" material="effector"/>
<site name="ball" size=".005"/>
</body>
</worldbody>
<actuator>
<motor name="x" joint="cup_x"/>
<motor name="z" joint="cup_z"/>
</actuator>
<tendon>
<spatial name="string" limited="true" range="0 0.3" width="0.003">
<site site="ball"/>
<site site="cup"/>
</spatial>
</tendon>
</mujoco>

View File

@ -0,0 +1,112 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Base class for tasks in the Control Suite."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dm_control import mujoco
from dm_control.rl import control
import numpy as np
class Task(control.Task):
"""Base class for tasks in the Control Suite.
Actions are mapped directly to the states of MuJoCo actuators: each element of
the action array is used to set the control input for a single actuator. The
ordering of the actuators is the same as in the corresponding MJCF XML file.
Attributes:
random: A `numpy.random.RandomState` instance. This should be used to
generate all random variables associated with the task, such as random
starting states, observation noise* etc.
*If sensor noise is enabled in the MuJoCo model then this will be generated
using MuJoCo's internal RNG, which has its own independent state.
"""
def __init__(self, random=None):
"""Initializes a new continuous control task.
Args:
random: Optional, either a `numpy.random.RandomState` instance, an integer
seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
if not isinstance(random, np.random.RandomState):
random = np.random.RandomState(random)
self._random = random
self._visualize_reward = False
@property
def random(self):
"""Task-specific `numpy.random.RandomState` instance."""
return self._random
def action_spec(self, physics):
"""Returns a `BoundedArraySpec` matching the `physics` actuators."""
return mujoco.action_spec(physics)
def initialize_episode(self, physics):
"""Resets geom colors to their defaults after starting a new episode.
Subclasses of `base.Task` must delegate to this method after performing
their own initialization.
Args:
physics: An instance of `mujoco.Physics`.
"""
self.after_step(physics)
def before_step(self, action, physics):
"""Sets the control signal for the actuators to values in `action`."""
# Support legacy internal code.
action = getattr(action, "continuous_actions", action)
physics.set_control(action)
def after_step(self, physics):
"""Modifies colors according to the reward."""
if self._visualize_reward:
reward = np.clip(self.get_reward(physics), 0.0, 1.0)
_set_reward_colors(physics, reward)
@property
def visualize_reward(self):
return self._visualize_reward
@visualize_reward.setter
def visualize_reward(self, value):
if not isinstance(value, bool):
raise ValueError("Expected a boolean, got {}.".format(type(value)))
self._visualize_reward = value
_MATERIALS = ["self", "effector", "target"]
_DEFAULT = [name + "_default" for name in _MATERIALS]
_HIGHLIGHT = [name + "_highlight" for name in _MATERIALS]
def _set_reward_colors(physics, reward):
"""Sets the highlight, effector and target colors according to the reward."""
assert 0.0 <= reward <= 1.0
colors = physics.named.model.mat_rgba
default = colors[_DEFAULT]
highlight = colors[_HIGHLIGHT]
blend_coef = reward ** 4 # Better color distinction near high rewards.
colors[_MATERIALS] = blend_coef * highlight + (1.0 - blend_coef) * default
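
A hedged sketch of how the reward-coloring hook above is typically switched on; `env.task` is the `control.Environment` accessor also used by the explorer script later in this commit, and Ball-in-Cup is just an example domain:

from local_dm_control_suite import ball_in_cup

env = ball_in_cup.catch()
env.task.visualize_reward = True  # must be a bool, otherwise a ValueError is raised
env.reset()
# From now on, after_step() blends the 'self', 'effector' and 'target' materials
# toward their *_highlight colors in proportion to reward**4.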

View File

@ -0,0 +1,230 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Cartpole domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
from lxml import etree
import numpy as np
from six.moves import range
_DEFAULT_TIME_LIMIT = 10
SUITE = containers.TaggedTasks()
def get_model_and_assets(num_poles=1):
"""Returns a tuple containing the model XML string and a dict of assets."""
return _make_model(num_poles), common.ASSETS
@SUITE.add('benchmarking')
def balance(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the Cartpole Balance task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Balance(swing_up=False, sparse=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add('benchmarking')
def balance_sparse(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the sparse reward variant of the Cartpole Balance task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Balance(swing_up=False, sparse=True, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add('benchmarking')
def swingup(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the Cartpole Swing-Up task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Balance(swing_up=True, sparse=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add('benchmarking')
def swingup_sparse(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the sparse reward variant of teh Cartpole Swing-Up task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Balance(swing_up=True, sparse=True, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add()
def two_poles(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the Cartpole Balance task with two poles."""
physics = Physics.from_xml_string(*get_model_and_assets(num_poles=2))
task = Balance(swing_up=True, sparse=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add()
def three_poles(time_limit=_DEFAULT_TIME_LIMIT, random=None, num_poles=3,
sparse=False, environment_kwargs=None):
"""Returns the Cartpole Balance task with three or more poles."""
physics = Physics.from_xml_string(*get_model_and_assets(num_poles=num_poles))
task = Balance(swing_up=True, sparse=sparse, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
def _make_model(n_poles):
"""Generates an xml string defining a cart with `n_poles` bodies."""
xml_string = common.read_model('cartpole.xml')
if n_poles == 1:
return xml_string
mjcf = etree.fromstring(xml_string)
parent = mjcf.find('./worldbody/body/body') # Find first pole.
# Make chain of poles.
for pole_index in range(2, n_poles+1):
child = etree.Element('body', name='pole_{}'.format(pole_index),
pos='0 0 1', childclass='pole')
etree.SubElement(child, 'joint', name='hinge_{}'.format(pole_index))
etree.SubElement(child, 'geom', name='pole_{}'.format(pole_index))
parent.append(child)
parent = child
# Move plane down.
floor = mjcf.find('./worldbody/geom')
floor.set('pos', '0 0 {}'.format(1 - n_poles - .05))
# Move cameras back.
cameras = mjcf.findall('./worldbody/camera')
cameras[0].set('pos', '0 {} 1'.format(-1 - 2*n_poles))
cameras[1].set('pos', '0 {} 2'.format(-2*n_poles))
return etree.tostring(mjcf, pretty_print=True)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Cartpole domain."""
def cart_position(self):
"""Returns the position of the cart."""
return self.named.data.qpos['slider'][0]
def angular_vel(self):
"""Returns the angular velocity of the pole."""
return self.data.qvel[1:]
def pole_angle_cosine(self):
"""Returns the cosine of the pole angle."""
return self.named.data.xmat[2:, 'zz']
def bounded_position(self):
"""Returns the state, with pole angle split into sin/cos."""
return np.hstack((self.cart_position(),
self.named.data.xmat[2:, ['zz', 'xz']].ravel()))
class Balance(base.Task):
"""A Cartpole `Task` to balance the pole.
State is initialized either close to the target configuration or at a random
configuration.
"""
_CART_RANGE = (-.25, .25)
_ANGLE_COSINE_RANGE = (.995, 1)
def __init__(self, swing_up, sparse, random=None):
"""Initializes an instance of `Balance`.
Args:
swing_up: A `bool`, which if `True` sets the cart to the middle of the
slider and the pole pointing towards the ground. Otherwise, sets the
cart to a random position on the slider and the pole to a random
near-vertical position.
sparse: A `bool`, whether to return a sparse or a smooth reward.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._sparse = sparse
self._swing_up = swing_up
super(Balance, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Initializes the cart and pole according to `swing_up`, and in both cases
adds a small random initial velocity to break symmetry.
Args:
physics: An instance of `Physics`.
"""
nv = physics.model.nv
if self._swing_up:
physics.named.data.qpos['slider'] = .01*self.random.randn()
physics.named.data.qpos['hinge_1'] = np.pi + .01*self.random.randn()
physics.named.data.qpos[2:] = .1*self.random.randn(nv - 2)
else:
physics.named.data.qpos['slider'] = self.random.uniform(-.1, .1)
physics.named.data.qpos[1:] = self.random.uniform(-.034, .034, nv - 1)
physics.named.data.qvel[:] = 0.01 * self.random.randn(physics.model.nv)
super(Balance, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of the (bounded) physics state."""
obs = collections.OrderedDict()
obs['position'] = physics.bounded_position()
obs['velocity'] = physics.velocity()
return obs
def _get_reward(self, physics, sparse):
if sparse:
cart_in_bounds = rewards.tolerance(physics.cart_position(),
self._CART_RANGE)
angle_in_bounds = rewards.tolerance(physics.pole_angle_cosine(),
self._ANGLE_COSINE_RANGE).prod()
return cart_in_bounds * angle_in_bounds
else:
upright = (physics.pole_angle_cosine() + 1) / 2
centered = rewards.tolerance(physics.cart_position(), margin=2)
centered = (1 + centered) / 2
small_control = rewards.tolerance(physics.control(), margin=1,
value_at_margin=0,
sigmoid='quadratic')[0]
small_control = (4 + small_control) / 5
small_velocity = rewards.tolerance(physics.angular_vel(), margin=5).min()
small_velocity = (1 + small_velocity) / 2
return upright.mean() * small_control * small_velocity * centered
def get_reward(self, physics):
"""Returns a sparse or a smooth reward, as specified in the constructor."""
return self._get_reward(physics, sparse=self._sparse)
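
Since `three_poles` forwards `num_poles` to `_make_model`, a longer pole chain can be requested directly; a hedged sketch:

from local_dm_control_suite import cartpole

env = cartpole.three_poles(num_poles=4, sparse=True)
print(env.action_spec().shape)  # still (1,): only the cart's slider is actuated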

View File

@ -0,0 +1,37 @@
<mujoco model="cart-pole">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.01" integrator="RK4">
<flag contact="disable" energy="enable"/>
</option>
<default>
<default class="pole">
<joint type="hinge" axis="0 1 0" damping="2e-6"/>
<geom type="capsule" fromto="0 0 0 0 0 1" size="0.045" material="self" mass=".1"/>
</default>
</default>
<worldbody>
<light name="light" pos="0 0 6"/>
<camera name="fixed" pos="0 -4 1" zaxis="0 -1 0"/>
<camera name="lookatcart" mode="targetbody" target="cart" pos="0 -2 2"/>
<geom name="floor" pos="0 0 -.05" size="4 4 .2" type="plane" material="grid"/>
<geom name="rail1" type="capsule" pos="0 .07 1" zaxis="1 0 0" size="0.02 2" material="decoration" />
<geom name="rail2" type="capsule" pos="0 -.07 1" zaxis="1 0 0" size="0.02 2" material="decoration" />
<body name="cart" pos="0 0 1">
<joint name="slider" type="slide" limited="true" axis="1 0 0" range="-1.8 1.8" solreflimit=".08 1" damping="5e-4"/>
<geom name="cart" type="box" size="0.2 0.15 0.1" material="self" mass="1"/>
<body name="pole_1" childclass="pole">
<joint name="hinge_1"/>
<geom name="pole_1"/>
</body>
</body>
</worldbody>
<actuator>
<motor name="slide" joint="slider" gear="10" ctrllimited="true" ctrlrange="-1 1" />
</actuator>
</mujoco>

View File

@ -0,0 +1,97 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Cheetah Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
# How long the simulation will run, in seconds.
_DEFAULT_TIME_LIMIT = 10
# Running speed above which reward is 1.
_RUN_SPEED = 10
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('cheetah.xml'), common.ASSETS
@SUITE.add('benchmarking')
def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the run task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Cheetah(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(physics, task, time_limit=time_limit,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Cheetah domain."""
def speed(self):
"""Returns the horizontal speed of the Cheetah."""
return self.named.data.sensordata['torso_subtreelinvel'][0]
class Cheetah(base.Task):
"""A `Task` to train a running Cheetah."""
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode."""
# The indexing below assumes that all joints have a single DOF.
assert physics.model.nq == physics.model.njnt
is_limited = physics.model.jnt_limited == 1
lower, upper = physics.model.jnt_range[is_limited].T
physics.data.qpos[is_limited] = self.random.uniform(lower, upper)
# Stabilize the model before the actual simulation.
for _ in range(200):
physics.step()
physics.data.time = 0
self._timeout_progress = 0
super(Cheetah, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of the state, ignoring horizontal position."""
obs = collections.OrderedDict()
# Ignores horizontal position to maintain translational invariance.
obs['position'] = physics.data.qpos[1:].copy()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
return rewards.tolerance(physics.speed(),
bounds=(_RUN_SPEED, float('inf')),
margin=_RUN_SPEED,
value_at_margin=0,
sigmoid='linear')
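
A hedged numeric sketch of the reward shaping above: with bounds=(10, inf), margin=10, value_at_margin=0 and a linear sigmoid, the reward ramps linearly from 0 at zero speed to 1 at _RUN_SPEED and stays saturated beyond it:

from dm_control.utils import rewards

for speed in (0.0, 5.0, 10.0, 15.0):
    r = rewards.tolerance(speed, bounds=(10, float('inf')), margin=10,
                          value_at_margin=0, sigmoid='linear')
    print(speed, r)
# expected values: approximately 0.0, 0.5, 1.0, 1.0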

View File

@ -0,0 +1,73 @@
<mujoco model="cheetah">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials_white_floor.xml"/>
<compiler settotalmass="14"/>
<default>
<default class="cheetah">
<joint limited="true" damping=".01" armature=".1" stiffness="8" type="hinge" axis="0 1 0"/>
<geom contype="1" conaffinity="1" condim="3" friction=".4 .1 .1" material="self"/>
</default>
<default class="free">
<joint limited="false" damping="0" armature="0" stiffness="0"/>
</default>
<motor ctrllimited="true" ctrlrange="-1 1"/>
</default>
<statistic center="0 0 .7" extent="2"/>
<option timestep="0.01"/>
<worldbody>
<geom name="ground" type="plane" conaffinity="1" pos="98 0 0" size="100 .8 .5" material="grid"/>
<body name="torso" pos="0 0 .7" childclass="cheetah">
<light name="light" pos="0 0 2" mode="trackcom"/>
<camera name="side" pos="0 -3 0" quat="0.707 0.707 0 0" mode="trackcom"/>
<camera name="back" pos="-1.8 -1.3 0.8" xyaxes="0.45 -0.9 0 0.3 0.15 0.94" mode="trackcom"/>
<joint name="rootx" type="slide" axis="1 0 0" class="free"/>
<joint name="rootz" type="slide" axis="0 0 1" class="free"/>
<joint name="rooty" type="hinge" axis="0 1 0" class="free"/>
<geom name="torso" type="capsule" fromto="-.5 0 0 .5 0 0" size="0.046"/>
<geom name="head" type="capsule" pos=".6 0 .1" euler="0 50 0" size="0.046 .15"/>
<body name="bthigh" pos="-.5 0 0">
<joint name="bthigh" range="-30 60" stiffness="240" damping="6"/>
<geom name="bthigh" type="capsule" pos=".1 0 -.13" euler="0 -218 0" size="0.046 .145"/>
<body name="bshin" pos=".16 0 -.25">
<joint name="bshin" range="-50 50" stiffness="180" damping="4.5"/>
<geom name="bshin" type="capsule" pos="-.14 0 -.07" euler="0 -116 0" size="0.046 .15"/>
<body name="bfoot" pos="-.28 0 -.14">
<joint name="bfoot" range="-230 50" stiffness="120" damping="3"/>
<geom name="bfoot" type="capsule" pos=".03 0 -.097" euler="0 -15 0" size="0.046 .094"/>
</body>
</body>
</body>
<body name="fthigh" pos=".5 0 0">
<joint name="fthigh" range="-57 .40" stiffness="180" damping="4.5"/>
<geom name="fthigh" type="capsule" pos="-.07 0 -.12" euler="0 30 0" size="0.046 .133"/>
<body name="fshin" pos="-.14 0 -.24">
<joint name="fshin" range="-70 50" stiffness="120" damping="3"/>
<geom name="fshin" type="capsule" pos=".065 0 -.09" euler="0 -34 0" size="0.046 .106"/>
<body name="ffoot" pos=".13 0 -.18">
<joint name="ffoot" range="-28 28" stiffness="60" damping="1.5"/>
<geom name="ffoot" type="capsule" pos=".045 0 -.07" euler="0 -34 0" size="0.046 .07"/>
</body>
</body>
</body>
</body>
</worldbody>
<sensor>
<subtreelinvel name="torso_subtreelinvel" body="torso"/>
</sensor>
<actuator>
<motor name="bthigh" joint="bthigh" gear="120" />
<motor name="bshin" joint="bshin" gear="90" />
<motor name="bfoot" joint="bfoot" gear="60" />
<motor name="fthigh" joint="fthigh" gear="90" />
<motor name="fshin" joint="fshin" gear="60" />
<motor name="ffoot" joint="ffoot" gear="30" />
</actuator>
</mujoco>

View File

@ -0,0 +1,39 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Functions to manage the common assets for domains."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from dm_control.utils import io as resources
_SUITE_DIR = os.path.dirname(os.path.dirname(__file__))
_FILENAMES = [
"./common/materials.xml",
"./common/materials_white_floor.xml",
"./common/skybox.xml",
"./common/visual.xml",
]
ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename))
for filename in _FILENAMES}
def read_model(model_filename):
"""Reads a model XML file and returns its contents as a string."""
return resources.GetResource(os.path.join(_SUITE_DIR, model_filename))
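
A hedged sketch of how the domain modules pair these helpers: read_model() resolves a path relative to the suite directory, and ASSETS supplies the shared include files referenced by each model XML:

from local_dm_control_suite import common

xml_string = common.read_model('cheetah.xml')  # contents of the model file
assets = common.ASSETS  # maps './common/*.xml' include paths to their contents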

View File

@ -0,0 +1,23 @@
<!--
Common textures, colors and materials to be used throughout this suite. Some
materials such as xxx_highlight are activated on occurrence of certain events,
for example receiving a positive reward.
-->
<mujoco>
<asset>
<texture name="grid" type="2d" builtin="checker" rgb1=".1 .1 .4" rgb2=".2 .2 .8" width="300" height="300" mark="edge" markrgb=".1 .1 .4"/>
<material name="grid" texture="grid" texrepeat="1 1" texuniform="true" reflectance=".2"/>
<material name="self" rgba=".7 .5 .3 1"/>
<material name="self_default" rgba=".7 .5 .3 1"/>
<material name="self_highlight" rgba="0 .5 .3 1"/>
<material name="effector" rgba=".7 .4 .2 1"/>
<material name="effector_default" rgba=".7 .4 .2 1"/>
<material name="effector_highlight" rgba="0 .5 .3 1"/>
<material name="decoration" rgba=".7 .5 .3 1"/>
<material name="eye" rgba="0 .2 1 1"/>
<material name="target" rgba=".6 .3 .3 1"/>
<material name="target_default" rgba=".6 .3 .3 1"/>
<material name="target_highlight" rgba=".6 .3 .3 .4"/>
<material name="site" rgba=".5 .5 .5 .3"/>
</asset>
</mujoco>

View File

@ -0,0 +1,23 @@
<!--
Common textures, colors and materials to be used throughout this suite. Some
materials such as xxx_highlight are activated on occurrence of certain events,
for example receiving a positive reward.
-->
<mujoco>
<asset>
<texture name="grid" type="2d" builtin="checker" rgb1=".1 .1 .1" rgb2=".2 .2 .2" width="300" height="300" mark="edge" markrgb=".1 .1 .1"/>
<material name="grid" texture="grid" texrepeat="1 1" texuniform="true" reflectance=".2"/>
<material name="self" rgba=".7 .5 .3 1"/>
<material name="self_default" rgba=".7 .5 .3 1"/>
<material name="self_highlight" rgba="0 .5 .3 1"/>
<material name="effector" rgba=".7 .4 .2 1"/>
<material name="effector_default" rgba=".7 .4 .2 1"/>
<material name="effector_highlight" rgba="0 .5 .3 1"/>
<material name="decoration" rgba=".3 .5 .7 1"/>
<material name="eye" rgba="0 .2 1 1"/>
<material name="target" rgba=".6 .3 .3 1"/>
<material name="target_default" rgba=".6 .3 .3 1"/>
<material name="target_highlight" rgba=".6 .3 .3 .4"/>
<material name="site" rgba=".5 .5 .5 .3"/>
</asset>
</mujoco>

View File

@ -0,0 +1,6 @@
<mujoco>
<asset>
<texture name="skybox" type="skybox" builtin="gradient" rgb1=".4 .6 .8" rgb2="0 0 0"
width="800" height="800" mark="random" markrgb="0 0 0"/>
</asset>
</mujoco>

View File

@ -0,0 +1,7 @@
<mujoco>
<visual>
<headlight ambient=".4 .4 .4" diffuse=".8 .8 .8" specular="0.1 0.1 0.1"/>
<map znear=".01"/>
<quality shadowsize="2048"/>
</visual>
</mujoco>

View File

@ -0,0 +1,84 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Demonstration of amc parsing for CMU mocap database.
To run the demo, supply a path to a `.amc` file:
python mocap_demo --filename='path/to/mocap.amc'
CMU motion capture clips are available at mocap.cs.cmu.edu
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
# Internal dependencies.
from absl import app
from absl import flags
from local_dm_control_suite import humanoid_CMU
from dm_control.suite.utils import parse_amc
import matplotlib.pyplot as plt
import numpy as np
FLAGS = flags.FLAGS
flags.DEFINE_string('filename', None, 'amc file to be converted.')
flags.DEFINE_integer('max_num_frames', 90,
'Maximum number of frames for plotting/playback')
def main(unused_argv):
env = humanoid_CMU.stand()
# Parse and convert specified clip.
converted = parse_amc.convert(FLAGS.filename,
env.physics, env.control_timestep())
max_frame = min(FLAGS.max_num_frames, converted.qpos.shape[1] - 1)
width = 480
height = 480
video = np.zeros((max_frame, height, 2 * width, 3), dtype=np.uint8)
for i in range(max_frame):
p_i = converted.qpos[:, i]
with env.physics.reset_context():
env.physics.data.qpos[:] = p_i
video[i] = np.hstack([env.physics.render(height, width, camera_id=0),
env.physics.render(height, width, camera_id=1)])
tic = time.time()
for i in range(max_frame):
if i == 0:
img = plt.imshow(video[i])
else:
img.set_data(video[i])
toc = time.time()
clock_dt = toc - tic
tic = time.time()
# Real-time playback not always possible as clock_dt > .03
plt.pause(max(0.01, 0.03 - clock_dt)) # Need min display time > 0.0.
plt.draw()
plt.waitforbuttonpress()
if __name__ == '__main__':
flags.mark_flag_as_required('filename')
app.run(main)

View File

@ -0,0 +1,213 @@
#DUMMY AMC for testing
:FULLY-SPECIFIED
:DEGREES
1
root 0 0 0 0 0 0
lowerback 0 0 0
upperback 0 0 0
thorax 0 0 0
lowerneck 0 0 0
upperneck 0 0 0
head 0 0 0
rclavicle 0 0
rhumerus 0 0 0
rradius 0
rwrist 0
rhand 0 0
rfingers 0
rthumb 0 0
lclavicle 0 0
lhumerus 0 0 0
lradius 0
lwrist 0
lhand 0 0
lfingers 0
lthumb 0 0
rfemur 0 0 0
rtibia 0
rfoot 0 0
rtoes 0
lfemur 0 0 0
ltibia 0
lfoot 0 0
ltoes 0
2
root 0 0 0 0 0 0
lowerback 0 0 0
upperback 0 0 0
thorax 0 0 0
lowerneck 0 0 0
upperneck 0 0 0
head 0 0 0
rclavicle 0 0
rhumerus 0 0 0
rradius 0
rwrist 0
rhand 0 0
rfingers 0
rthumb 0 0
lclavicle 0 0
lhumerus 0 0 0
lradius 0
lwrist 0
lhand 0 0
lfingers 0
lthumb 0 0
rfemur 0 0 0
rtibia 0
rfoot 0 0
rtoes 0
lfemur 0 0 0
ltibia 0
lfoot 0 0
ltoes 0
3
root 0 0 0 0 0 0
lowerback 0 0 0
upperback 0 0 0
thorax 0 0 0
lowerneck 0 0 0
upperneck 0 0 0
head 0 0 0
rclavicle 0 0
rhumerus 0 0 0
rradius 0
rwrist 0
rhand 0 0
rfingers 0
rthumb 0 0
lclavicle 0 0
lhumerus 0 0 0
lradius 0
lwrist 0
lhand 0 0
lfingers 0
lthumb 0 0
rfemur 0 0 0
rtibia 0
rfoot 0 0
rtoes 0
lfemur 0 0 0
ltibia 0
lfoot 0 0
ltoes 0
4
root 0 0 0 0 0 0
lowerback 0 0 0
upperback 0 0 0
thorax 0 0 0
lowerneck 0 0 0
upperneck 0 0 0
head 0 0 0
rclavicle 0 0
rhumerus 0 0 0
rradius 0
rwrist 0
rhand 0 0
rfingers 0
rthumb 0 0
lclavicle 0 0
lhumerus 0 0 0
lradius 0
lwrist 0
lhand 0 0
lfingers 0
lthumb 0 0
rfemur 0 0 0
rtibia 0
rfoot 0 0
rtoes 0
lfemur 0 0 0
ltibia 0
lfoot 0 0
ltoes 0
5
root 0 0 0 0 0 0
lowerback 0 0 0
upperback 0 0 0
thorax 0 0 0
lowerneck 0 0 0
upperneck 0 0 0
head 0 0 0
rclavicle 0 0
rhumerus 0 0 0
rradius 0
rwrist 0
rhand 0 0
rfingers 0
rthumb 0 0
lclavicle 0 0
lhumerus 0 0 0
lradius 0
lwrist 0
lhand 0 0
lfingers 0
lthumb 0 0
rfemur 0 0 0
rtibia 0
rfoot 0 0
rtoes 0
lfemur 0 0 0
ltibia 0
lfoot 0 0
ltoes 0
6
root 0 0 0 0 0 0
lowerback 0 0 0
upperback 0 0 0
thorax 0 0 0
lowerneck 0 0 0
upperneck 0 0 0
head 0 0 0
rclavicle 0 0
rhumerus 0 0 0
rradius 0
rwrist 0
rhand 0 0
rfingers 0
rthumb 0 0
lclavicle 0 0
lhumerus 0 0 0
lradius 0
lwrist 0
lhand 0 0
lfingers 0
lthumb 0 0
rfemur 0 0 0
rtibia 0
rfoot 0 0
rtoes 0
lfemur 0 0 0
ltibia 0
lfoot 0 0
ltoes 0
7
root 0 0 0 0 0 0
lowerback 0 0 0
upperback 0 0 0
thorax 0 0 0
lowerneck 0 0 0
upperneck 0 0 0
head 0 0 0
rclavicle 0 0
rhumerus 0 0 0
rradius 0
rwrist 0
rhand 0 0
rfingers 0
rthumb 0 0
lclavicle 0 0
lhumerus 0 0 0
lradius 0
lwrist 0
lhand 0 0
lfingers 0
lthumb 0 0
rfemur 0 0 0
rtibia 0
rfoot 0 0
rtoes 0
lfemur 0 0 0
ltibia 0
lfoot 0 0
ltoes 0

View File

@ -0,0 +1,84 @@
# Copyright 2018 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Control suite environments explorer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from dm_control import suite
from dm_control.suite.wrappers import action_noise
from six.moves import input
from dm_control import viewer
_ALL_NAMES = ['.'.join(domain_task) for domain_task in suite.ALL_TASKS]
flags.DEFINE_enum('environment_name', None, _ALL_NAMES,
'Optional \'domain_name.task_name\' pair specifying the '
'environment to load. If unspecified a prompt will appear to '
'select one.')
flags.DEFINE_bool('timeout', True, 'Whether episodes should have a time limit.')
flags.DEFINE_bool('visualize_reward', True,
'Whether to vary the colors of geoms according to the '
'current reward value.')
flags.DEFINE_float('action_noise', 0.,
'Standard deviation of Gaussian noise to apply to actions, '
'expressed as a fraction of the max-min range for each '
'action dimension. Defaults to 0, i.e. no noise.')
FLAGS = flags.FLAGS
def prompt_environment_name(prompt, values):
environment_name = None
while not environment_name:
environment_name = input(prompt)
if not environment_name or environment_name not in values:
print('"%s" is not a valid environment name.' % environment_name)
environment_name = None
return environment_name
def main(argv):
del argv
environment_name = FLAGS.environment_name
if environment_name is None:
print('\n '.join(['Available environments:'] + _ALL_NAMES))
environment_name = prompt_environment_name(
'Please select an environment name: ', _ALL_NAMES)
index = _ALL_NAMES.index(environment_name)
domain_name, task_name = suite.ALL_TASKS[index]
task_kwargs = {}
if not FLAGS.timeout:
task_kwargs['time_limit'] = float('inf')
def loader():
env = suite.load(
domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs)
env.task.visualize_reward = FLAGS.visualize_reward
if FLAGS.action_noise > 0:
env = action_noise.Wrapper(env, scale=FLAGS.action_noise)
return env
viewer.launch(loader)
if __name__ == '__main__':
app.run(main)
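
Assuming the explorer above is saved as explore.py (the file name here is an assumption), a typical invocation would be:

python explore.py --environment_name=cartpole.swingup --action_noise=0.1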

View File

@ -0,0 +1,217 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Finger Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
import numpy as np
from six.moves import range
_DEFAULT_TIME_LIMIT = 20 # (seconds)
_CONTROL_TIMESTEP = .02 # (seconds)
# For TURN tasks, the 'tip' geom needs to enter a spherical target of sizes:
_EASY_TARGET_SIZE = 0.07
_HARD_TARGET_SIZE = 0.03
# Initial spin velocity for the Stop task.
_INITIAL_SPIN_VELOCITY = 100
# Spinning slower than this value (radian/second) is considered stopped.
_STOP_VELOCITY = 1e-6
# Spinning faster than this value (radian/second) is considered spinning.
_SPIN_VELOCITY = 15.0
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('finger.xml'), common.ASSETS
@SUITE.add('benchmarking')
def spin(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Spin task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Spin(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add('benchmarking')
def turn_easy(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the easy Turn task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Turn(target_radius=_EASY_TARGET_SIZE, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add('benchmarking')
def turn_hard(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the hard Turn task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Turn(target_radius=_HARD_TARGET_SIZE, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Finger domain."""
def touch(self):
"""Returns logarithmically scaled signals from the two touch sensors."""
return np.log1p(self.named.data.sensordata[['touchtop', 'touchbottom']])
def hinge_velocity(self):
"""Returns the velocity of the hinge joint."""
return self.named.data.sensordata['hinge_velocity']
def tip_position(self):
"""Returns the (x,z) position of the tip relative to the hinge."""
return (self.named.data.sensordata['tip'][[0, 2]] -
self.named.data.sensordata['spinner'][[0, 2]])
def bounded_position(self):
"""Returns the positions, with the hinge angle replaced by tip position."""
return np.hstack((self.named.data.sensordata[['proximal', 'distal']],
self.tip_position()))
def velocity(self):
"""Returns the velocities (extracted from sensordata)."""
return self.named.data.sensordata[['proximal_velocity',
'distal_velocity',
'hinge_velocity']]
def target_position(self):
"""Returns the (x,z) position of the target relative to the hinge."""
return (self.named.data.sensordata['target'][[0, 2]] -
self.named.data.sensordata['spinner'][[0, 2]])
def to_target(self):
"""Returns the vector from the tip to the target."""
return self.target_position() - self.tip_position()
def dist_to_target(self):
"""Returns the signed distance to the target surface, negative is inside."""
return (np.linalg.norm(self.to_target()) -
self.named.model.site_size['target', 0])
class Spin(base.Task):
"""A Finger `Task` to spin the stopped body."""
def __init__(self, random=None):
"""Initializes a new `Spin` instance.
Args:
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
super(Spin, self).__init__(random=random)
def initialize_episode(self, physics):
physics.named.model.site_rgba['target', 3] = 0
physics.named.model.site_rgba['tip', 3] = 0
physics.named.model.dof_damping['hinge'] = .03
_set_random_joint_angles(physics, self.random)
super(Spin, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns state and touch sensors, and target info."""
obs = collections.OrderedDict()
obs['position'] = physics.bounded_position()
obs['velocity'] = physics.velocity()
obs['touch'] = physics.touch()
return obs
def get_reward(self, physics):
"""Returns a sparse reward."""
return float(physics.hinge_velocity() <= -_SPIN_VELOCITY)
class Turn(base.Task):
"""A Finger `Task` to turn the body to a target angle."""
def __init__(self, target_radius, random=None):
"""Initializes a new `Turn` instance.
Args:
target_radius: Radius of the target site, which specifies the goal angle.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._target_radius = target_radius
super(Turn, self).__init__(random=random)
def initialize_episode(self, physics):
target_angle = self.random.uniform(-np.pi, np.pi)
hinge_x, hinge_z = physics.named.data.xanchor['hinge', ['x', 'z']]
radius = physics.named.model.geom_size['cap1'].sum()
target_x = hinge_x + radius * np.sin(target_angle)
target_z = hinge_z + radius * np.cos(target_angle)
physics.named.model.site_pos['target', ['x', 'z']] = target_x, target_z
physics.named.model.site_size['target', 0] = self._target_radius
_set_random_joint_angles(physics, self.random)
super(Turn, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns state, touch sensors, and target info."""
obs = collections.OrderedDict()
obs['position'] = physics.bounded_position()
obs['velocity'] = physics.velocity()
obs['touch'] = physics.touch()
obs['target_position'] = physics.target_position()
obs['dist_to_target'] = physics.dist_to_target()
return obs
def get_reward(self, physics):
return float(physics.dist_to_target() <= 0)
def _set_random_joint_angles(physics, random, max_attempts=1000):
"""Sets the joints to a random collision-free state."""
for _ in range(max_attempts):
randomizers.randomize_limited_and_rotational_joints(physics, random)
# Check for collisions.
physics.after_reset()
if physics.data.ncon == 0:
break
else:
raise RuntimeError('Could not find a collision-free state '
'after {} attempts'.format(max_attempts))

View File

@ -0,0 +1,72 @@
<mujoco model="finger">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.01" cone="elliptic" iterations="200">
<flag gravity="disable"/>
</option>
<default>
<geom solimp="0 0.9 0.01" solref=".02 1"/>
<joint type="hinge" axis="0 -1 0"/>
<motor ctrllimited="true" ctrlrange="-1 1"/>
<default class="finger">
<joint damping="2.5" limited="true"/>
<site type="ellipsoid" size=".025 .03 .025" material="site" group="3"/>
</default>
</default>
<worldbody>
<light name="light" directional="true" diffuse=".6 .6 .6" pos="0 0 2" specular=".3 .3 .3"/>
<geom name="ground" type="plane" pos="0 0 0" size=".6 .2 10" material="grid"/>
<camera name="cam0" pos="0 -1 .8" xyaxes="1 0 0 0 1 2"/>
<camera name="cam1" pos="0 -1 .4" xyaxes="1 0 0 0 0 1" />
<body name="proximal" pos="-.2 0 .4" childclass="finger">
<geom name="proximal_decoration" type="cylinder" fromto="0 -.033 0 0 .033 0" size=".034" material="decoration"/>
<joint name="proximal" range="-110 110" ref="-90"/>
<geom name="proximal" type="capsule" material="self" size=".03" fromto="0 0 0 0 0 -.17"/>
<body name="distal" pos="0 0 -.18" childclass="finger">
<joint name="distal" range="-110 110"/>
<geom name="distal" type="capsule" size=".028" material="self" fromto="0 0 0 0 0 -.16" contype="0" conaffinity="0"/>
<geom name="fingertip" type="capsule" size=".03" material="effector" fromto="0 0 -.13 0 0 -.161"/>
<site name="touchtop" pos=".01 0 -.17"/>
<site name="touchbottom" pos="-.01 0 -.17"/>
</body>
</body>
<body name="spinner" pos=".2 0 .4">
<joint name="hinge" frictionloss=".1" damping=".5"/>
<geom name="cap1" type="capsule" size=".04 .09" material="self" pos=".02 0 0"/>
<geom name="cap2" type="capsule" size=".04 .09" material="self" pos="-.02 0 0"/>
<site name="tip" type="sphere" size=".02" pos="0 0 .13" material="target"/>
<geom name="spinner_decoration" type="cylinder" fromto="0 -.045 0 0 .045 0" size=".02" material="decoration"/>
</body>
<site name="target" type="sphere" size=".03" pos="0 0 .4" material="target"/>
</worldbody>
<actuator>
<motor name="proximal" joint="proximal" gear="30"/>
<motor name="distal" joint="distal" gear="15"/>
</actuator>
<!-- All finger observations are functions of sensors. This is useful for finite-differencing. -->
<sensor>
<jointpos name="proximal" joint="proximal"/>
<jointpos name="distal" joint="distal"/>
<jointvel name="proximal_velocity" joint="proximal"/>
<jointvel name="distal_velocity" joint="distal"/>
<jointvel name="hinge_velocity" joint="hinge"/>
<framepos name="tip" objtype="site" objname="tip"/>
<framepos name="target" objtype="site" objname="target"/>
<framepos name="spinner" objtype="xbody" objname="spinner"/>
<touch name="touchtop" site="touchtop"/>
<touch name="touchbottom" site="touchbottom"/>
<framepos name="touchtop_pos" objtype="site" objname="touchtop"/>
<framepos name="touchbottom_pos" objtype="site" objname="touchbottom"/>
</sensor>
</mujoco>

View File

@ -0,0 +1,176 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Fish Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
_DEFAULT_TIME_LIMIT = 40
_CONTROL_TIMESTEP = .04
_JOINTS = ['tail1',
'tail_twist',
'tail2',
'finright_roll',
'finright_pitch',
'finleft_roll',
'finleft_pitch']
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('fish.xml'), common.ASSETS
@SUITE.add('benchmarking')
def upright(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the Fish Upright task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Upright(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
@SUITE.add('benchmarking')
def swim(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Fish Swim task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Swim(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Fish domain."""
def upright(self):
"""Returns projection from z-axes of torso to the z-axes of worldbody."""
return self.named.data.xmat['torso', 'zz']
def torso_velocity(self):
"""Returns velocities and angular velocities of the torso."""
return self.data.sensordata
def joint_velocities(self):
"""Returns the joint velocities."""
return self.named.data.qvel[_JOINTS]
def joint_angles(self):
"""Returns the joint positions."""
return self.named.data.qpos[_JOINTS]
def mouth_to_target(self):
"""Returns a vector, from mouth to target in local coordinate of mouth."""
data = self.named.data
mouth_to_target_global = data.geom_xpos['target'] - data.geom_xpos['mouth']
return mouth_to_target_global.dot(data.geom_xmat['mouth'].reshape(3, 3))
class Upright(base.Task):
"""A Fish `Task` for getting the torso upright with smooth reward."""
def __init__(self, random=None):
"""Initializes an instance of `Upright`.
Args:
random: Either an existing `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically.
"""
super(Upright, self).__init__(random=random)
def initialize_episode(self, physics):
"""Randomizes the tail and fin angles and the orientation of the Fish."""
quat = self.random.randn(4)
physics.named.data.qpos['root'][3:7] = quat / np.linalg.norm(quat)
for joint in _JOINTS:
physics.named.data.qpos[joint] = self.random.uniform(-.2, .2)
# Hide the target. It's irrelevant for this task.
physics.named.model.geom_rgba['target', 3] = 0
super(Upright, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of joint angles, velocities and uprightness."""
obs = collections.OrderedDict()
obs['joint_angles'] = physics.joint_angles()
obs['upright'] = physics.upright()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a smooth reward."""
return rewards.tolerance(physics.upright(), bounds=(1, 1), margin=1)
class Swim(base.Task):
"""A Fish `Task` for swimming with smooth reward."""
def __init__(self, random=None):
"""Initializes an instance of `Swim`.
Args:
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
super(Swim, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode."""
quat = self.random.randn(4)
physics.named.data.qpos['root'][3:7] = quat / np.linalg.norm(quat)
for joint in _JOINTS:
physics.named.data.qpos[joint] = self.random.uniform(-.2, .2)
# Randomize target position.
physics.named.model.geom_pos['target', 'x'] = self.random.uniform(-.4, .4)
physics.named.model.geom_pos['target', 'y'] = self.random.uniform(-.4, .4)
physics.named.model.geom_pos['target', 'z'] = self.random.uniform(.1, .3)
super(Swim, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of joints, target direction and velocities."""
obs = collections.OrderedDict()
obs['joint_angles'] = physics.joint_angles()
obs['upright'] = physics.upright()
obs['target'] = physics.mouth_to_target()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a smooth reward."""
radii = physics.named.model.geom_size[['mouth', 'target'], 0].sum()
in_target = rewards.tolerance(np.linalg.norm(physics.mouth_to_target()),
bounds=(0, radii), margin=2*radii)
is_upright = 0.5 * (physics.upright() + 1)
return (7*in_target + is_upright) / 8
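
A hedged sketch for the Swim task above; the observation is an OrderedDict whose 'target' entry is the mouth-frame vector to the randomized target geom:

from local_dm_control_suite import fish

env = fish.swim()
time_step = env.reset()
print(list(time_step.observation.keys()))
# expected: ['joint_angles', 'upright', 'target', 'velocity']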

View File

@ -0,0 +1,85 @@
<mujoco model="fish">
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<asset>
<texture name="skybox" type="skybox" builtin="gradient" rgb1=".4 .6 .8" rgb2="0 0 0" width="800" height="800" mark="random" markrgb="1 1 1"/>
</asset>
<option timestep="0.004" density="5000">
<flag gravity="disable" constraint="disable"/>
</option>
<default>
<general ctrllimited="true"/>
<default class="fish">
<joint type="hinge" limited="false" range="-60 60" damping="2e-5" solreflimit=".1 1" solimplimit="0 .8 .1"/>
<geom material="self"/>
</default>
</default>
<worldbody>
<camera name="tracking_top" pos="0 0 1" xyaxes="1 0 0 0 1 0" mode="trackcom"/>
<camera name="tracking_x" pos="-.3 0 .2" xyaxes="0 -1 0 0.342 0 0.940" fovy="60" mode="trackcom"/>
<camera name="tracking_y" pos="0 -.3 .2" xyaxes="1 0 0 0 0.342 0.940" fovy="60" mode="trackcom"/>
<camera name="fixed_top" pos="0 0 5.5" fovy="10"/>
<geom name="ground" type="plane" size=".5 .5 .1" material="grid"/>
<geom name="target" type="sphere" pos="0 .4 .1" size=".04" material="target"/>
<body name="torso" pos="0 0 .1" childclass="fish">
<light name="light" diffuse=".6 .6 .6" pos="0 0 0.5" dir="0 0 -1" specular=".3 .3 .3" mode="track"/>
<joint name="root" type="free" damping="0" limited="false"/>
<site name="torso" size=".01" rgba="0 0 0 0"/>
<geom name="eye" type="ellipsoid" pos="0 .055 .015" size=".008 .012 .008" euler="-10 0 0" material="eye" mass="0"/>
<camera name="eye" pos="0 .06 .02" xyaxes="1 0 0 0 0 1"/>
<geom name="mouth" type="capsule" fromto="0 .079 0 0 .07 0" size=".005" material="effector" mass="0"/>
<geom name="lower_mouth" type="capsule" fromto="0 .079 -.004 0 .07 -.003" size=".0045" material="effector" mass="0"/>
<geom name="torso" type="ellipsoid" size=".01 .08 .04" mass="0"/>
<geom name="back_fin" type="ellipsoid" size=".001 .03 .015" pos="0 -.03 .03" material="effector" mass="0"/>
<geom name="torso_massive" type="box" size=".002 .06 .03" group="4"/>
<body name="tail1" pos="0 -.09 0">
<joint name="tail1" axis="0 0 1" pos="0 .01 0"/>
<joint name="tail_twist" axis="0 1 0" pos="0 .01 0" range="-30 30"/>
<geom name="tail1" type="ellipsoid" size=".001 .008 .016"/>
<body name="tail2" pos="0 -.028 0">
<joint name="tail2" axis="0 0 1" pos="0 .02 0" stiffness="8e-5"/>
<geom name="tail2" type="ellipsoid" size=".001 .018 .035"/>
</body>
</body>
<body name="finright" pos=".01 0 0">
<joint name="finright_roll" axis="0 1 0"/>
<joint name="finright_pitch" axis="1 0 0" pos="0 .005 0"/>
<geom name="finright" type="ellipsoid" pos=".015 0 0" size=".02 .015 .001" />
</body>
<body name="finleft" pos="-.01 0 0">
<joint name="finleft_roll" axis="0 1 0"/>
<joint name="finleft_pitch" axis="1 0 0" pos="0 .005 0"/>
<geom name="finleft" type="ellipsoid" pos="-.015 0 0" size=".02 .015 .001"/>
</body>
</body>
</worldbody>
<tendon>
<fixed name="fins_flap">
<joint joint="finleft_roll" coef="-.5"/>
<joint joint="finright_roll" coef=".5"/>
</fixed>
<fixed name="fins_sym" stiffness="1e-4">
<joint joint="finleft_roll" coef=".5"/>
<joint joint="finright_roll" coef=".5"/>
</fixed>
</tendon>
<actuator>
<position name="tail" joint="tail1" ctrlrange="-1 1" kp="5e-4"/>
<position name="tail_twist" joint="tail_twist" ctrlrange="-1 1" kp="1e-4"/>
<position name="fins_flap" tendon="fins_flap" ctrlrange="-1 1" kp="3e-4"/>
<position name="finleft_pitch" joint="finleft_pitch" ctrlrange="-1 1" kp="1e-4"/>
<position name="finright_pitch" joint="finright_pitch" ctrlrange="-1 1" kp="1e-4"/>
</actuator>
<sensor>
<velocimeter name="velocimeter" site="torso"/>
<gyro name="gyro" site="torso"/>
</sensor>
</mujoco>

View File

@ -0,0 +1,138 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Hopper domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
SUITE = containers.TaggedTasks()
_CONTROL_TIMESTEP = .02 # (Seconds)
# Default duration of an episode, in seconds.
_DEFAULT_TIME_LIMIT = 20
# Minimal height of torso over foot above which stand reward is 1.
_STAND_HEIGHT = 0.6
# Hopping speed above which hop reward is 1.
_HOP_SPEED = 2
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('hopper.xml'), common.ASSETS
@SUITE.add('benchmarking')
def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns a Hopper that strives to stand upright, balancing its pose."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Hopper(hopping=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add('benchmarking')
def hop(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns a Hopper that strives to hop forward."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Hopper(hopping=True, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Hopper domain."""
def height(self):
"""Returns height of torso with respect to foot."""
return (self.named.data.xipos['torso', 'z'] -
self.named.data.xipos['foot', 'z'])
def speed(self):
"""Returns horizontal speed of the Hopper."""
return self.named.data.sensordata['torso_subtreelinvel'][0]
def touch(self):
"""Returns the signals from two foot touch sensors."""
return np.log1p(self.named.data.sensordata[['touch_toe', 'touch_heel']])
class Hopper(base.Task):
"""A Hopper's `Task` to train a standing and a jumping Hopper."""
def __init__(self, hopping, random=None):
"""Initialize an instance of `Hopper`.
Args:
hopping: Boolean, if True the task is to hop forwards, otherwise it is to
balance upright.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._hopping = hopping
super(Hopper, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode."""
randomizers.randomize_limited_and_rotational_joints(physics, self.random)
self._timeout_progress = 0
super(Hopper, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of positions, velocities and touch sensors."""
obs = collections.OrderedDict()
# Ignores horizontal position to maintain translational invariance:
obs['position'] = physics.data.qpos[1:].copy()
obs['velocity'] = physics.velocity()
obs['touch'] = physics.touch()
return obs
def get_reward(self, physics):
"""Returns a reward applicable to the performed task."""
standing = rewards.tolerance(physics.height(), (_STAND_HEIGHT, 2))
if self._hopping:
hopping = rewards.tolerance(physics.speed(),
bounds=(_HOP_SPEED, float('inf')),
margin=_HOP_SPEED/2,
value_at_margin=0.5,
sigmoid='linear')
return standing * hopping
else:
small_control = rewards.tolerance(physics.control(),
margin=1, value_at_margin=0,
sigmoid='quadratic').mean()
small_control = (small_control + 4) / 5
return standing * small_control
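A minimal usage sketch for the tasks registered above (illustrative only; it assumes this file is importable as local_dm_control_suite.hopper and that dm_control's dm_env-style TimeStep API is available):
from local_dm_control_suite import hopper  # assumed module path for this file
import numpy as np

env = hopper.hop()                # or hopper.stand()
action_spec = env.action_spec()
time_step = env.reset()
while not time_step.last():
    # Uniform random actions within the actuator bounds, purely for illustration.
    action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)
# time_step.observation is an OrderedDict with 'position', 'velocity', 'touch'.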

View File

@ -0,0 +1,66 @@
<mujoco model="planar hopper">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials_white_floor.xml"/>
<statistic extent="2" center="0 0 .5"/>
<default>
<default class="hopper">
<joint type="hinge" axis="0 1 0" limited="true" damping=".05" armature=".2"/>
<geom type="capsule" material="self"/>
<site type="sphere" size="0.05" group="3"/>
</default>
<default class="free">
<joint limited="false" damping="0" armature="0" stiffness="0"/>
</default>
<motor ctrlrange="-1 1" ctrllimited="true"/>
</default>
<option timestep="0.005"/>
<worldbody>
<camera name="cam0" pos="0 -2.8 0.8" euler="90 0 0" mode="trackcom"/>
<camera name="back" pos="-2 -.2 1.2" xyaxes="0.2 -1 0 .5 0 2" mode="trackcom"/>
<geom name="floor" type="plane" conaffinity="1" pos="48 0 0" size="50 1 .2" material="grid"/>
<body name="torso" pos="0 0 1" childclass="hopper">
<light name="top" pos="0 0 2" mode="trackcom"/>
<joint name="rootx" type="slide" axis="1 0 0" class="free"/>
<joint name="rootz" type="slide" axis="0 0 1" class="free"/>
<joint name="rooty" type="hinge" axis="0 1 0" class="free"/>
<geom name="torso" fromto="0 0 -.05 0 0 .2" size="0.0653"/>
<geom name="nose" fromto=".08 0 .13 .15 0 .14" size="0.03"/>
<body name="pelvis" pos="0 0 -.05">
<joint name="waist" range="-30 30"/>
<geom name="pelvis" fromto="0 0 0 0 0 -.15" size="0.065"/>
<body name="thigh" pos="0 0 -.2">
<joint name="hip" range="-170 10"/>
<geom name="thigh" fromto="0 0 0 0 0 -.33" size="0.04"/>
<body name="calf" pos="0 0 -.33">
<joint name="knee" range="5 150"/>
<geom name="calf" fromto="0 0 0 0 0 -.32" size="0.03"/>
<body name="foot" pos="0 0 -.32">
<joint name="ankle" range="-45 45"/>
<geom name="foot" fromto="-.08 0 0 .17 0 0" size="0.04"/>
<site name="touch_toe" pos=".17 0 0"/>
<site name="touch_heel" pos="-.08 0 0"/>
</body>
</body>
</body>
</body>
</body>
</worldbody>
<sensor>
<subtreelinvel name="torso_subtreelinvel" body="torso"/>
<touch name="touch_toe" site="touch_toe"/>
<touch name="touch_heel" site="touch_heel"/>
</sensor>
<actuator>
<motor name="waist" joint="waist" gear="30"/>
<motor name="hip" joint="hip" gear="40"/>
<motor name="knee" joint="knee" gear="30"/>
<motor name="ankle" joint="ankle" gear="10"/>
</actuator>
</mujoco>
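The sensors declared in the block above can be read back by name once the model is loaded through the suite's asset loader; a small sketch (assuming the file is stored as hopper.xml alongside the common assets):
from dm_control import mujoco
from local_dm_control_suite import common  # assumed packaging of this suite

physics = mujoco.Physics.from_xml_string(
    common.read_model('hopper.xml'), common.ASSETS)
physics.step()
# Named access to the <sensor> entries defined above.
print(physics.named.data.sensordata['torso_subtreelinvel'])
print(physics.named.data.sensordata[['touch_toe', 'touch_heel']])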

View File

@ -0,0 +1,211 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Humanoid Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
_DEFAULT_TIME_LIMIT = 25
_CONTROL_TIMESTEP = .025
# Height of head above which stand reward is 1.
_STAND_HEIGHT = 1.4
# Horizontal speeds above which move reward is 1.
_WALK_SPEED = 1
_RUN_SPEED = 10
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('humanoid.xml'), common.ASSETS
@SUITE.add('benchmarking')
def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Stand task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Humanoid(move_speed=0, pure_state=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add('benchmarking')
def walk(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Walk task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Humanoid(move_speed=_WALK_SPEED, pure_state=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add('benchmarking')
def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Run task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Humanoid(move_speed=_RUN_SPEED, pure_state=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add()
def run_pure_state(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the Run task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Humanoid(move_speed=_RUN_SPEED, pure_state=True, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Walker domain."""
def torso_upright(self):
"""Returns projection from z-axes of torso to the z-axes of world."""
return self.named.data.xmat['torso', 'zz']
def head_height(self):
"""Returns the height of the torso."""
return self.named.data.xpos['head', 'z']
def center_of_mass_position(self):
"""Returns position of the center-of-mass."""
return self.named.data.subtree_com['torso'].copy()
def center_of_mass_velocity(self):
"""Returns the velocity of the center-of-mass."""
return self.named.data.sensordata['torso_subtreelinvel'].copy()
def torso_vertical_orientation(self):
"""Returns the z-projection of the torso orientation matrix."""
return self.named.data.xmat['torso', ['zx', 'zy', 'zz']]
def joint_angles(self):
"""Returns the state without global orientation or position."""
return self.data.qpos[7:].copy() # Skip the 7 DoFs of the free root joint.
def extremities(self):
"""Returns end effector positions in egocentric frame."""
torso_frame = self.named.data.xmat['torso'].reshape(3, 3)
torso_pos = self.named.data.xpos['torso']
positions = []
for side in ('left_', 'right_'):
for limb in ('hand', 'foot'):
torso_to_limb = self.named.data.xpos[side + limb] - torso_pos
positions.append(torso_to_limb.dot(torso_frame))
return np.hstack(positions)
class Humanoid(base.Task):
"""A humanoid task."""
def __init__(self, move_speed, pure_state, random=None):
"""Initializes an instance of `Humanoid`.
Args:
move_speed: A float. If this value is zero, reward is given simply for
standing up. Otherwise this specifies a target horizontal velocity for
the walking task.
pure_state: A bool. Whether the observations consist of the pure MuJoCo
state or include some useful features derived from it.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._move_speed = move_speed
self._pure_state = pure_state
super(Humanoid, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Args:
physics: An instance of `Physics`.
"""
# Find a collision-free random initial configuration.
penetrating = True
while penetrating:
randomizers.randomize_limited_and_rotational_joints(physics, self.random)
# Check for collisions.
physics.after_reset()
penetrating = physics.data.ncon > 0
super(Humanoid, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns either the pure state or a set of egocentric features."""
obs = collections.OrderedDict()
if self._pure_state:
obs['position'] = physics.position()
obs['velocity'] = physics.velocity()
else:
obs['joint_angles'] = physics.joint_angles()
obs['head_height'] = physics.head_height()
obs['extremities'] = physics.extremities()
obs['torso_vertical'] = physics.torso_vertical_orientation()
obs['com_velocity'] = physics.center_of_mass_velocity()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
standing = rewards.tolerance(physics.head_height(),
bounds=(_STAND_HEIGHT, float('inf')),
margin=_STAND_HEIGHT/4)
upright = rewards.tolerance(physics.torso_upright(),
bounds=(0.9, float('inf')), sigmoid='linear',
margin=1.9, value_at_margin=0)
stand_reward = standing * upright
small_control = rewards.tolerance(physics.control(), margin=1,
value_at_margin=0,
sigmoid='quadratic').mean()
small_control = (4 + small_control) / 5
if self._move_speed == 0:
horizontal_velocity = physics.center_of_mass_velocity()[[0, 1]]
dont_move = rewards.tolerance(horizontal_velocity, margin=2).mean()
return small_control * stand_reward * dont_move
else:
com_velocity = np.linalg.norm(physics.center_of_mass_velocity()[[0, 1]])
move = rewards.tolerance(com_velocity,
bounds=(self._move_speed, float('inf')),
margin=self._move_speed, value_at_margin=0,
sigmoid='linear')
move = (5*move + 1) / 6
return small_control * stand_reward * move
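To illustrate how the locomotion term in get_reward above shapes the signal, the same rewards.tolerance call can be evaluated at a few hand-picked horizontal speeds (the sample speeds below are arbitrary):
from dm_control.utils import rewards

_RUN_SPEED = 10
for speed in (0.0, 2.5, 5.0, 10.0, 12.0):
    move = rewards.tolerance(speed,
                             bounds=(_RUN_SPEED, float('inf')),
                             margin=_RUN_SPEED, value_at_margin=0,
                             sigmoid='linear')
    # Rescaled as in get_reward, so a motionless humanoid still gets 1/6.
    print(speed, (5 * move + 1) / 6)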

View File

@ -0,0 +1,202 @@
<mujoco model="humanoid">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<statistic extent="2" center="0 0 1"/>
<option timestep=".005"/>
<default>
<motor ctrlrange="-1 1" ctrllimited="true"/>
<default class="body">
<geom type="capsule" condim="1" friction=".7" solimp=".9 .99 .003" solref=".015 1" material="self"/>
<joint type="hinge" damping=".2" stiffness="1" armature=".01" limited="true" solimplimit="0 .99 .01"/>
<default class="big_joint">
<joint damping="5" stiffness="10"/>
<default class="big_stiff_joint">
<joint stiffness="20"/>
</default>
</default>
<site size=".04" group="3"/>
<default class="force-torque">
<site type="box" size=".01 .01 .02" rgba="1 0 0 1" />
</default>
<default class="touch">
<site type="capsule" rgba="0 0 1 .3"/>
</default>
</default>
</default>
<worldbody>
<geom name="floor" type="plane" conaffinity="1" size="100 100 .2" material="grid"/>
<body name="torso" pos="0 0 1.5" childclass="body">
<light name="top" pos="0 0 2" mode="trackcom"/>
<camera name="back" pos="-3 0 1" xyaxes="0 -1 0 1 0 2" mode="trackcom"/>
<camera name="side" pos="0 -3 1" xyaxes="1 0 0 0 1 2" mode="trackcom"/>
<freejoint name="root"/>
<site name="root" class="force-torque"/>
<geom name="torso" fromto="0 -.07 0 0 .07 0" size=".07"/>
<geom name="upper_waist" fromto="-.01 -.06 -.12 -.01 .06 -.12" size=".06"/>
<site name="torso" class="touch" type="box" pos="0 0 -.05" size=".075 .14 .13"/>
<body name="head" pos="0 0 .19">
<geom name="head" type="sphere" size=".09"/>
<site name="head" class="touch" type="sphere" size=".091"/>
<camera name="egocentric" pos=".09 0 0" xyaxes="0 -1 0 .1 0 1" fovy="80"/>
</body>
<body name="lower_waist" pos="-.01 0 -.260" quat="1.000 0 -.002 0">
<geom name="lower_waist" fromto="0 -.06 0 0 .06 0" size=".06"/>
<site name="lower_waist" class="touch" size=".061 .06" zaxis="0 1 0"/>
<joint name="abdomen_z" pos="0 0 .065" axis="0 0 1" range="-45 45" class="big_stiff_joint"/>
<joint name="abdomen_y" pos="0 0 .065" axis="0 1 0" range="-75 30" class="big_joint"/>
<body name="pelvis" pos="0 0 -.165" quat="1.000 0 -.002 0">
<joint name="abdomen_x" pos="0 0 .1" axis="1 0 0" range="-35 35" class="big_joint"/>
<geom name="butt" fromto="-.02 -.07 0 -.02 .07 0" size=".09"/>
<site name="butt" class="touch" size=".091 .07" pos="-.02 0 0" zaxis="0 1 0"/>
<body name="right_thigh" pos="0 -.1 -.04">
<site name="right_hip" class="force-torque"/>
<joint name="right_hip_x" axis="1 0 0" range="-25 5" class="big_joint"/>
<joint name="right_hip_z" axis="0 0 1" range="-60 35" class="big_joint"/>
<joint name="right_hip_y" axis="0 1 0" range="-110 20" class="big_stiff_joint"/>
<geom name="right_thigh" fromto="0 0 0 0 .01 -.34" size=".06"/>
<site name="right_thigh" class="touch" pos="0 .005 -.17" size=".061 .17" zaxis="0 -1 34"/>
<body name="right_shin" pos="0 .01 -.403">
<site name="right_knee" class="force-torque" pos="0 0 .02"/>
<joint name="right_knee" pos="0 0 .02" axis="0 -1 0" range="-160 2"/>
<geom name="right_shin" fromto="0 0 0 0 0 -.3" size=".049"/>
<site name="right_shin" class="touch" pos="0 0 -.15" size=".05 .15"/>
<body name="right_foot" pos="0 0 -.39">
<site name="right_ankle" class="force-torque"/>
<joint name="right_ankle_y" pos="0 0 .08" axis="0 1 0" range="-50 50" stiffness="6"/>
<joint name="right_ankle_x" pos="0 0 .04" axis="1 0 .5" range="-50 50" stiffness="3"/>
<geom name="right_right_foot" fromto="-.07 -.02 0 .14 -.04 0" size=".027"/>
<geom name="left_right_foot" fromto="-.07 0 0 .14 .02 0" size=".027"/>
<site name="right_right_foot" class="touch" pos=".035 -.03 0" size=".03 .11" zaxis="21 -2 0"/>
<site name="left_right_foot" class="touch" pos=".035 .01 0" size=".03 .11" zaxis="21 2 0"/>
</body>
</body>
</body>
<body name="left_thigh" pos="0 .1 -.04">
<site name="left_hip" class="force-torque"/>
<joint name="left_hip_x" axis="-1 0 0" range="-25 5" class="big_joint"/>
<joint name="left_hip_z" axis="0 0 -1" range="-60 35" class="big_joint"/>
<joint name="left_hip_y" axis="0 1 0" range="-120 20" class="big_stiff_joint"/>
<geom name="left_thigh" fromto="0 0 0 0 -.01 -.34" size=".06"/>
<site name="left_thigh" class="touch" pos="0 -.005 -.17" size=".061 .17" zaxis="0 1 34"/>
<body name="left_shin" pos="0 -.01 -.403">
<site name="left_knee" class="force-torque" pos="0 0 .02"/>
<joint name="left_knee" pos="0 0 .02" axis="0 -1 0" range="-160 2"/>
<geom name="left_shin" fromto="0 0 0 0 0 -.3" size=".049"/>
<site name="left_shin" class="touch" pos="0 0 -.15" size=".05 .15"/>
<body name="left_foot" pos="0 0 -.39">
<site name="left_ankle" class="force-torque"/>
<joint name="left_ankle_y" pos="0 0 .08" axis="0 1 0" range="-50 50" stiffness="6"/>
<joint name="left_ankle_x" pos="0 0 .04" axis="1 0 .5" range="-50 50" stiffness="3"/>
<geom name="left_left_foot" fromto="-.07 .02 0 .14 .04 0" size=".027"/>
<geom name="right_left_foot" fromto="-.07 0 0 .14 -.02 0" size=".027"/>
<site name="right_left_foot" class="touch" pos=".035 -.01 0" size=".03 .11" zaxis="21 -2 0"/>
<site name="left_left_foot" class="touch" pos=".035 .03 0" size=".03 .11" zaxis="21 2 0"/>
</body>
</body>
</body>
</body>
</body>
<body name="right_upper_arm" pos="0 -.17 .06">
<joint name="right_shoulder1" axis="2 1 1" range="-85 60"/>
<joint name="right_shoulder2" axis="0 -1 1" range="-85 60"/>
<geom name="right_upper_arm" fromto="0 0 0 .16 -.16 -.16" size=".04 .16"/>
<site name="right_upper_arm" class="touch" pos=".08 -.08 -.08" size=".041 .14" zaxis="1 -1 -1"/>
<body name="right_lower_arm" pos=".18 -.18 -.18">
<joint name="right_elbow" axis="0 -1 1" range="-90 50" stiffness="0"/>
<geom name="right_lower_arm" fromto=".01 .01 .01 .17 .17 .17" size=".031"/>
<site name="right_lower_arm" class="touch" pos=".09 .09 .09" size=".032 .14" zaxis="1 1 1"/>
<body name="right_hand" pos=".18 .18 .18">
<geom name="right_hand" type="sphere" size=".04"/>
<site name="right_hand" class="touch" type="sphere" size=".041"/>
</body>
</body>
</body>
<body name="left_upper_arm" pos="0 .17 .06">
<joint name="left_shoulder1" axis="2 -1 1" range="-60 85"/>
<joint name="left_shoulder2" axis="0 1 1" range="-60 85"/>
<geom name="left_upper_arm" fromto="0 0 0 .16 .16 -.16" size=".04 .16"/>
<site name="left_upper_arm" class="touch" pos=".08 .08 -.08" size=".041 .14" zaxis="1 1 -1"/>
<body name="left_lower_arm" pos=".18 .18 -.18">
<joint name="left_elbow" axis="0 -1 -1" range="-90 50" stiffness="0"/>
<geom name="left_lower_arm" fromto=".01 -.01 .01 .17 -.17 .17" size=".031"/>
<site name="left_lower_arm" class="touch" pos=".09 -.09 .09" size=".032 .14" zaxis="1 -1 1"/>
<body name="left_hand" pos=".18 -.18 .18">
<geom name="left_hand" type="sphere" size=".04"/>
<site name="left_hand" class="touch" type="sphere" size=".041"/>
</body>
</body>
</body>
</body>
</worldbody>
<actuator>
<motor name="abdomen_y" gear="40" joint="abdomen_y"/>
<motor name="abdomen_z" gear="40" joint="abdomen_z"/>
<motor name="abdomen_x" gear="40" joint="abdomen_x"/>
<motor name="right_hip_x" gear="40" joint="right_hip_x"/>
<motor name="right_hip_z" gear="40" joint="right_hip_z"/>
<motor name="right_hip_y" gear="120" joint="right_hip_y"/>
<motor name="right_knee" gear="80" joint="right_knee"/>
<motor name="right_ankle_x" gear="20" joint="right_ankle_x"/>
<motor name="right_ankle_y" gear="20" joint="right_ankle_y"/>
<motor name="left_hip_x" gear="40" joint="left_hip_x"/>
<motor name="left_hip_z" gear="40" joint="left_hip_z"/>
<motor name="left_hip_y" gear="120" joint="left_hip_y"/>
<motor name="left_knee" gear="80" joint="left_knee"/>
<motor name="left_ankle_x" gear="20" joint="left_ankle_x"/>
<motor name="left_ankle_y" gear="20" joint="left_ankle_y"/>
<motor name="right_shoulder1" gear="20" joint="right_shoulder1"/>
<motor name="right_shoulder2" gear="20" joint="right_shoulder2"/>
<motor name="right_elbow" gear="40" joint="right_elbow"/>
<motor name="left_shoulder1" gear="20" joint="left_shoulder1"/>
<motor name="left_shoulder2" gear="20" joint="left_shoulder2"/>
<motor name="left_elbow" gear="40" joint="left_elbow"/>
</actuator>
<sensor>
<subtreelinvel name="torso_subtreelinvel" body="torso"/>
<accelerometer name="torso_accel" site="root"/>
<velocimeter name="torso_vel" site="root"/>
<gyro name="torso_gyro" site="root"/>
<force name="left_ankle_force" site="left_ankle"/>
<force name="right_ankle_force" site="right_ankle"/>
<force name="left_knee_force" site="left_knee"/>
<force name="right_knee_force" site="right_knee"/>
<force name="left_hip_force" site="left_hip"/>
<force name="right_hip_force" site="right_hip"/>
<torque name="left_ankle_torque" site="left_ankle"/>
<torque name="right_ankle_torque" site="right_ankle"/>
<torque name="left_knee_torque" site="left_knee"/>
<torque name="right_knee_torque" site="right_knee"/>
<torque name="left_hip_torque" site="left_hip"/>
<torque name="right_hip_torque" site="right_hip"/>
<touch name="torso_touch" site="torso"/>
<touch name="head_touch" site="head"/>
<touch name="lower_waist_touch" site="lower_waist"/>
<touch name="butt_touch" site="butt"/>
<touch name="right_thigh_touch" site="right_thigh"/>
<touch name="right_shin_touch" site="right_shin"/>
<touch name="right_right_foot_touch" site="right_right_foot"/>
<touch name="left_right_foot_touch" site="left_right_foot"/>
<touch name="left_thigh_touch" site="left_thigh"/>
<touch name="left_shin_touch" site="left_shin"/>
<touch name="right_left_foot_touch" site="right_left_foot"/>
<touch name="left_left_foot_touch" site="left_left_foot"/>
<touch name="right_upper_arm_touch" site="right_upper_arm"/>
<touch name="right_lower_arm_touch" site="right_lower_arm"/>
<touch name="right_hand_touch" site="right_hand"/>
<touch name="left_upper_arm_touch" site="left_upper_arm"/>
<touch name="left_lower_arm_touch" site="left_lower_arm"/>
<touch name="left_hand_touch" site="left_hand"/>
</sensor>
</mujoco>

View File

@ -0,0 +1,179 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Humanoid_CMU Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
_DEFAULT_TIME_LIMIT = 20
_CONTROL_TIMESTEP = 0.02
# Height of head above which stand reward is 1.
_STAND_HEIGHT = 1.4
# Horizontal speeds above which move reward is 1.
_WALK_SPEED = 1
_RUN_SPEED = 10
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('humanoid_CMU.xml'), common.ASSETS
@SUITE.add()
def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Stand task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = HumanoidCMU(move_speed=0, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add()
def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Run task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = HumanoidCMU(move_speed=_RUN_SPEED, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the humanoid_CMU domain."""
def thorax_upright(self):
"""Returns projection from y-axes of thorax to the z-axes of world."""
return self.named.data.xmat['thorax', 'zy']
def head_height(self):
"""Returns the height of the head."""
return self.named.data.xpos['head', 'z']
def center_of_mass_position(self):
"""Returns position of the center-of-mass."""
return self.named.data.subtree_com['thorax']
def center_of_mass_velocity(self):
"""Returns the velocity of the center-of-mass."""
return self.named.data.sensordata['thorax_subtreelinvel'].copy()
def torso_vertical_orientation(self):
"""Returns the z-projection of the thorax orientation matrix."""
return self.named.data.xmat['thorax', ['zx', 'zy', 'zz']]
def joint_angles(self):
"""Returns the state without global orientation or position."""
return self.data.qpos[7:].copy() # Skip the 7 DoFs of the free root joint.
def extremities(self):
"""Returns end effector positions in egocentric frame."""
torso_frame = self.named.data.xmat['thorax'].reshape(3, 3)
torso_pos = self.named.data.xpos['thorax']
positions = []
for side in ('l', 'r'):
for limb in ('hand', 'foot'):
torso_to_limb = self.named.data.xpos[side + limb] - torso_pos
positions.append(torso_to_limb.dot(torso_frame))
return np.hstack(positions)
class HumanoidCMU(base.Task):
"""A task for the CMU Humanoid."""
def __init__(self, move_speed, random=None):
"""Initializes an instance of `Humanoid_CMU`.
Args:
move_speed: A float. If this value is zero, reward is given simply for
standing up. Otherwise this specifies a target horizontal velocity for
the walking task.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._move_speed = move_speed
super(HumanoidCMU, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets a random collision-free configuration at the start of each episode.
Args:
physics: An instance of `Physics`.
"""
penetrating = True
while penetrating:
randomizers.randomize_limited_and_rotational_joints(
physics, self.random)
# Check for collisions.
physics.after_reset()
penetrating = physics.data.ncon > 0
super(HumanoidCMU, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns a set of egocentric features."""
obs = collections.OrderedDict()
obs['joint_angles'] = physics.joint_angles()
obs['head_height'] = physics.head_height()
obs['extremities'] = physics.extremities()
obs['torso_vertical'] = physics.torso_vertical_orientation()
obs['com_velocity'] = physics.center_of_mass_velocity()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
standing = rewards.tolerance(physics.head_height(),
bounds=(_STAND_HEIGHT, float('inf')),
margin=_STAND_HEIGHT/4)
upright = rewards.tolerance(physics.thorax_upright(),
bounds=(0.9, float('inf')), sigmoid='linear',
margin=1.9, value_at_margin=0)
stand_reward = standing * upright
small_control = rewards.tolerance(physics.control(), margin=1,
value_at_margin=0,
sigmoid='quadratic').mean()
small_control = (4 + small_control) / 5
if self._move_speed == 0:
horizontal_velocity = physics.center_of_mass_velocity()[[0, 1]]
dont_move = rewards.tolerance(horizontal_velocity, margin=2).mean()
return small_control * stand_reward * dont_move
else:
com_velocity = np.linalg.norm(physics.center_of_mass_velocity()[[0, 1]])
move = rewards.tolerance(com_velocity,
bounds=(self._move_speed, float('inf')),
margin=self._move_speed, value_at_margin=0,
sigmoid='linear')
move = (5*move + 1) / 6
return small_control * stand_reward * move

View File

@ -0,0 +1,289 @@
<mujoco model="humanoid_CMU">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<statistic extent="2" center="0 0 1"/>
<default class="main">
<joint limited="true" solimplimit="0 0.99 0.01" stiffness="0.1" armature=".01" damping="1"/>
<geom friction="0.7" solref="0.015 1" solimp="0.95 0.99 0.003"/>
<motor ctrllimited="true" ctrlrange="-1 1"/>
<default class="humanoid">
<geom type="capsule" material="self"/>
<default class="stiff_low">
<joint stiffness=".5" damping="4"/>
</default>
<default class="stiff_medium">
<joint stiffness="10" damping="5"/>
</default>
<default class="stiff_high">
<joint stiffness="30" damping="10"/>
</default>
<default class="touch">
<site group="3" rgba="0 0 1 .5"/>
</default>
</default>
</default>
<worldbody>
<geom name="floor" type="plane" conaffinity="1" size="100 100 .2" material="grid"/>
<light name="tracking_light" pos="0 0 7" dir="0 0 -1" mode="trackcom"/>
<camera name="back" pos="0 3 2.4" xyaxes="-1 0 0 0 -1 2" mode="trackcom"/>
<camera name="side" pos="-3 0 2.4" xyaxes="0 -1 0 1 0 2" mode="trackcom"/>
<body name="root" childclass="humanoid" pos="0 0 1" euler="90 0 0">
<site name="root" size=".01" rgba="0.5 0.5 0.5 0"/>
<freejoint name="root"/>
<geom name="root_geom" size="0.09 0.06" pos="0 -0.05 0" quat="1 0 -1 0"/>
<body name="lhipjoint">
<geom name="lhipjoint" size="0.008 0.022" pos="0.051 -0.046 0.025" quat="0.5708 -0.566602 -0.594264 0"/>
<body name="lfemur" pos="0.102 -0.092 0.05" quat="1 0 0 0.17365">
<joint name="lfemurrz" axis="0 0 1" range="-60 70" class="stiff_medium"/>
<joint name="lfemurry" axis="0 1 0" range="-70 70" class="stiff_medium"/>
<joint name="lfemurrx" axis="1 0 0" range="-160 20" class="stiff_medium"/>
<geom name="lfemur" size="0.06 0.17" pos="-.01 -0.202473 0" quat="0.7 -0.7 -0.1228 -0.07"/>
<body name="ltibia" pos="0 -0.404945 0">
<joint name="ltibiarx" axis="1 0 0" range="1 170" class="stiff_low"/>
<geom name="ltibia" size="0.03 0.1825614" pos="0 -0.202846 0" quat="0.7 -0.7 -0.1228 -0.1228"/>
<geom name="lcalf" size="0.045 0.08" pos="0 -0.1 -.01" quat="0.7 -0.7 -0.1228 -0.1228"/>
<body name="lfoot" pos="0 -0.405693 0" quat="0.707107 -0.707107 0 0">
<site name="lfoot_touch" type="box" pos="-.005 -.02 -0.025" size=".04 .08 .02" euler="10 0 0" class="touch"/>
<joint name="lfootrz" axis="0 0 1" range="-70 20" class="stiff_medium"/>
<joint name="lfootrx" axis="1 0 0" range="-45 90" class="stiff_medium"/>
<geom name="lfoot0" size="0.02 0.06" pos="-0.02 -0.023 -0.01" euler="100 -2 0"/>
<geom name="lfoot1" size="0.02 0.06" pos="0 -0.023 -0.01" euler="100 0 0"/>
<geom name="lfoot2" size="0.02 0.06" pos=".01 -0.023 -0.01" euler="100 10 0"/>
<body name="ltoes" pos="0 -0.106372 -0.0227756">
<joint name="ltoesrx" axis="1 0 0" range="-90 20"/>
<geom name="ltoes0" type="sphere" size="0.02" pos="-.025 -0.01 -.01"/>
<geom name="ltoes1" type="sphere" size="0.02" pos="0 -0.005 -.01"/>
<geom name="ltoes2" type="sphere" size="0.02" pos=".02 .001 -.01"/>
<site name="ltoes_touch" type="capsule" pos="-.005 -.005 -.01" size="0.025 0.02" zaxis="1 .2 0" class="touch"/>
</body>
</body>
</body>
</body>
</body>
<body name="rhipjoint">
<geom name="rhipjoint" size="0.008 0.022" pos="-0.051 -0.046 0.025" quat="0.574856 -0.547594 0.608014 0"/>
<body name="rfemur" pos="-0.102 -0.092 0.05" quat="1 0 0 -0.17365">
<joint name="rfemurrz" axis="0 0 1" range="-70 60" class="stiff_medium"/>
<joint name="rfemurry" axis="0 1 0" range="-70 70" class="stiff_medium"/>
<joint name="rfemurrx" axis="1 0 0" range="-160 20" class="stiff_medium"/>
<geom name="rfemur" size="0.06 0.17" pos=".01 -0.202473 0" quat="0.7 -0.7 0.1228 0.07"/>
<body name="rtibia" pos="0 -0.404945 0">
<joint name="rtibiarx" axis="1 0 0" range="1 170" class="stiff_low"/>
<geom name="rtibia" size="0.03 0.1825614" pos="0 -0.202846 0" quat="0.7 -0.7 0.1228 0.1228"/>
<geom name="rcalf" size="0.045 0.08" pos="0 -0.1 -.01" quat="0.7 -0.7 -0.1228 -0.1228"/>
<body name="rfoot" pos="0 -0.405693 0" quat="0.707107 -0.707107 0 0">
<site name="rfoot_touch" type="box" pos=".005 -.02 -0.025" size=".04 .08 .02" euler="10 0 0" class="touch"/>
<joint name="rfootrz" axis="0 0 1" range="-20 70" class="stiff_medium"/>
<joint name="rfootrx" axis="1 0 0" range="-45 90" class="stiff_medium"/>
<geom name="rfoot0" size="0.02 0.06" pos="0.02 -0.023 -0.01" euler="100 2 0"/>
<geom name="rfoot1" size="0.02 0.06" pos="0 -0.023 -0.01" euler="100 0 0"/>
<geom name="rfoot2" size="0.02 0.06" pos="-.01 -0.023 -0.01" euler="100 -10 0"/>
<body name="rtoes" pos="0 -0.106372 -0.0227756">
<joint name="rtoesrx" axis="1 0 0" range="-90 20"/>
<geom name="rtoes0" type="sphere" size="0.02" pos=".025 -0.01 -.01"/>
<geom name="rtoes1" type="sphere" size="0.02" pos="0 -0.005 -.01"/>
<geom name="rtoes2" type="sphere" size="0.02" pos="-.02 .001 -.01"/>
<site name="rtoes_touch" type="capsule" pos=".005 -.005 -.01" size="0.025 0.02" zaxis="1 -.2 0" class="touch"/>
</body>
</body>
</body>
</body>
</body>
<body name="lowerback">
<joint name="lowerbackrz" axis="0 0 1" range="-30 30" class="stiff_high"/>
<joint name="lowerbackry" axis="0 1 0" range="-30 30" class="stiff_high"/>
<joint name="lowerbackrx" axis="1 0 0" range="-20 45" class="stiff_high"/>
<geom name="lowerback" size="0.065 0.055" pos="0 0.056 .03" quat="1 0 1 0"/>
<body name="upperback" pos="0 0.1 -0.01">
<joint name="upperbackrz" axis="0 0 1" range="-30 30" class="stiff_high"/>
<joint name="upperbackry" axis="0 1 0" range="-30 30" class="stiff_high"/>
<joint name="upperbackrx" axis="1 0 0" range="-20 45" class="stiff_high"/>
<geom name="upperback" size="0.06 0.06" pos="0 0.06 0.02" quat="1 0 1 0"/>
<body name="thorax" pos="0.000512528 0.11356 0.000936821">
<joint name="thoraxrz" axis="0 0 1" range="-30 30" class="stiff_high"/>
<joint name="thoraxry" axis="0 1 0" range="-30 30" class="stiff_high"/>
<joint name="thoraxrx" axis="1 0 0" range="-20 45" class="stiff_high"/>
<geom name="thorax" size="0.08 0.07" pos="0 0.05 0" quat="1 0 1 0"/>
<body name="lowerneck" pos="0 0.113945 0.00468037">
<joint name="lowerneckrz" axis="0 0 1" range="-30 30" class="stiff_medium"/>
<joint name="lowerneckry" axis="0 1 0" range="-30 30" class="stiff_medium"/>
<joint name="lowerneckrx" axis="1 0 0" range="-20 45" class="stiff_medium"/>
<geom name="lowerneck" size="0.08 0.02" pos="0 0.04 -.02" quat="1 1 0 0"/>
<body name="upperneck" pos="0 0.09 0.01">
<joint name="upperneckrz" axis="0 0 1" range="-30 30" class="stiff_medium"/>
<joint name="upperneckry" axis="0 1 0" range="-30 30" class="stiff_medium"/>
<joint name="upperneckrx" axis="1 0 0" range="-20 45" class="stiff_medium"/>
<geom name="upperneck" size="0.05 0.03" pos="0 0.05 0" quat=".8 1 0 0"/>
<body name="head" pos="0 0.09 0">
<camera name="egocentric" pos="0 0 0" xyaxes="-1 0 0 0 1 0" fovy="80"/>
<joint name="headrz" axis="0 0 1" range="-30 30" class="stiff_medium"/>
<joint name="headry" axis="0 1 0" range="-30 30" class="stiff_medium"/>
<joint name="headrx" axis="1 0 0" range="-20 45" class="stiff_medium"/>
<geom name="head" size="0.085 0.035" pos="0 0.11 0.03" quat="1 .9 0 0"/>
<geom name="leye" type="sphere" size="0.02" pos=" .03 0.11 0.1"/>
<geom name="reye" type="sphere" size="0.02" pos="-.03 0.11 0.1"/>
</body>
</body>
</body>
<body name="lclavicle" pos="0 0.113945 0.00468037">
<joint name="lclaviclerz" axis="0 0 1" range="0 20" class="stiff_high"/>
<joint name="lclaviclery" axis="0 1 0" range="-20 10" class="stiff_high"/>
<geom name="lclavicle" size="0.08 0.04" pos="0.09 0.05 -.01" quat="1 0 -1 -.4"/>
<body name="lhumerus" pos="0.183 0.076 0.01" quat="0.18 0.68 -0.68 0.18">
<joint name="lhumerusrz" axis="0 0 1" range="-90 90" class="stiff_low"/>
<joint name="lhumerusry" axis="0 1 0" range="-90 90" class="stiff_low"/>
<joint name="lhumerusrx" axis="1 0 0" range="-60 90" class="stiff_low"/>
<geom name="lhumerus" size="0.035 0.124" pos="0 -0.138 0" quat="0.612 -0.612 0.35 0.35"/>
<body name="lradius" pos="0 -0.277 0">
<joint name="lradiusrx" axis="1 0 0" range="-10 170" class="stiff_low"/>
<geom name="lradius" size="0.03 0.06" pos="0 -0.08 0" quat="0.612 -0.612 0.35 0.35"/>
<body name="lwrist" pos="0 -0.17 0" quat="-0.5 0 0.866 0">
<joint name="lwristry" axis="0 1 0" range="-180 0"/>
<geom name="lwrist" size="0.025 0.03" pos="0 -0.02 0" quat="0 0 -1 -1"/>
<body name="lhand" pos="0 -0.08 0">
<joint name="lhandrz" axis="0 0 1" range="-45 45"/>
<joint name="lhandrx" axis="1 0 0" range="-90 90"/>
<geom name="lhand" type="ellipsoid" size=".048 0.02 0.06" pos="0 -0.047 0" quat="0 0 -1 -1"/>
<body name="lfingers" pos="0 -0.08 0">
<joint name="lfingersrx" axis="1 0 0" range="0 90"/>
<geom name="lfinger0" size="0.01 0.04" pos="-.03 -0.05 0" quat="1 -1 0 0" />
<geom name="lfinger1" size="0.01 0.04" pos="-.008 -0.06 0" quat="1 -1 0 0" />
<geom name="lfinger2" size="0.009 0.04" pos=".014 -0.06 0" quat="1 -1 0 0" />
<geom name="lfinger3" size="0.008 0.04" pos=".032 -0.05 0" quat="1 -1 0 0" />
</body>
<body name="lthumb" pos="-.02 -.03 0" quat="0.92388 0 0 -0.382683">
<joint name="lthumbrz" axis="0 0 1" range="-45 45"/>
<joint name="lthumbrx" axis="1 0 0" range="0 90"/>
<geom name="lthumb" size="0.012 0.04" pos="0 -0.06 0" quat="0 0 -1 -1"/>
</body>
</body>
</body>
</body>
</body>
</body>
<body name="rclavicle" pos="0 0.113945 0.00468037">
<joint name="rclaviclerz" axis="0 0 1" range="-20 0" class="stiff_high"/>
<joint name="rclaviclery" axis="0 1 0" range="-10 20" class="stiff_high"/>
<geom name="rclavicle" size="0.08 0.04" pos="-.09 0.05 -.01" quat="1 0 -1 .4"/>
<body name="rhumerus" pos="-0.183 0.076 0.01" quat="0.18 0.68 0.68 -0.18">
<joint name="rhumerusrz" axis="0 0 1" range="-90 90" class="stiff_low"/>
<joint name="rhumerusry" axis="0 1 0" range="-90 90" class="stiff_low"/>
<joint name="rhumerusrx" axis="1 0 0" range="-60 90" class="stiff_low"/>
<geom name="rhumerus" size="0.035 0.124" pos="0 -0.138 0" quat="0.61 -0.61 -0.35 -0.35"/>
<body name="rradius" pos="0 -0.277 0">
<joint name="rradiusrx" axis="1 0 0" range="-10 170" class="stiff_low"/>
<geom name="rradius" size="0.03 0.06" pos="0 -0.08 0" quat="0.612 -0.612 -0.35 -0.35"/>
<body name="rwrist" pos="0 -0.17 0" quat="-0.5 0 -0.866 0">
<joint name="rwristry" axis="0 1 0" range="-180 0"/>
<geom name="rwrist" size="0.025 0.03" pos="0 -0.02 0" quat="0 0 1 1"/>
<body name="rhand" pos="0 -0.08 0">
<joint name="rhandrz" axis="0 0 1" range="-45 45"/>
<joint name="rhandrx" axis="1 0 0" range="-90 90"/>
<geom name="rhand" type="ellipsoid" size=".048 0.02 .06" pos="0 -0.047 0" quat="0 0 1 1"/>
<body name="rfingers" pos="0 -0.08 0">
<joint name="rfingersrx" axis="1 0 0" range="0 90"/>
<geom name="rfinger0" size="0.01 0.04" pos=".03 -0.05 0" quat="1 -1 0 0" />
<geom name="rfinger1" size="0.01 0.04" pos=".008 -0.06 0" quat="1 -1 0 0" />
<geom name="rfinger2" size="0.009 0.04" pos="-.014 -0.06 0" quat="1 -1 0 0" />
<geom name="rfinger3" size="0.008 0.04" pos="-.032 -0.05 0" quat="1 -1 0 0" />
</body>
<body name="rthumb" pos=".02 -.03 0" quat="0.92388 0 0 0.382683">
<joint name="rthumbrz" axis="0 0 1" range="-45 45"/>
<joint name="rthumbrx" axis="1 0 0" range="0 90"/>
<geom name="rthumb" size="0.012 0.04" pos="0 -0.06 0" quat="0 0 1 1"/>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</body>
</worldbody>
<contact>
<exclude body1="lclavicle" body2="rclavicle"/>
<exclude body1="lowerneck" body2="lclavicle"/>
<exclude body1="lowerneck" body2="rclavicle"/>
<exclude body1="upperneck" body2="lclavicle"/>
<exclude body1="upperneck" body2="rclavicle"/>
</contact>
<actuator>
<motor name="headrx" joint="headrx" gear="20"/>
<motor name="headry" joint="headry" gear="20"/>
<motor name="headrz" joint="headrz" gear="20"/>
<motor name="lclaviclery" joint="lclaviclery" gear="20"/>
<motor name="lclaviclerz" joint="lclaviclerz" gear="20"/>
<motor name="lfemurrx" joint="lfemurrx" gear="120"/>
<motor name="lfemurry" joint="lfemurry" gear="40"/>
<motor name="lfemurrz" joint="lfemurrz" gear="40"/>
<motor name="lfingersrx" joint="lfingersrx" gear="20"/>
<motor name="lfootrx" joint="lfootrx" gear="20"/>
<motor name="lfootrz" joint="lfootrz" gear="20"/>
<motor name="lhandrx" joint="lhandrx" gear="20"/>
<motor name="lhandrz" joint="lhandrz" gear="20"/>
<motor name="lhumerusrx" joint="lhumerusrx" gear="40"/>
<motor name="lhumerusry" joint="lhumerusry" gear="40"/>
<motor name="lhumerusrz" joint="lhumerusrz" gear="40"/>
<motor name="lowerbackrx" joint="lowerbackrx" gear="40"/>
<motor name="lowerbackry" joint="lowerbackry" gear="40"/>
<motor name="lowerbackrz" joint="lowerbackrz" gear="40"/>
<motor name="lowerneckrx" joint="lowerneckrx" gear="20"/>
<motor name="lowerneckry" joint="lowerneckry" gear="20"/>
<motor name="lowerneckrz" joint="lowerneckrz" gear="20"/>
<motor name="lradiusrx" joint="lradiusrx" gear="40"/>
<motor name="lthumbrx" joint="lthumbrx" gear="20"/>
<motor name="lthumbrz" joint="lthumbrz" gear="20"/>
<motor name="ltibiarx" joint="ltibiarx" gear="80"/>
<motor name="ltoesrx" joint="ltoesrx" gear="20"/>
<motor name="lwristry" joint="lwristry" gear="20"/>
<motor name="rclaviclery" joint="rclaviclery" gear="20"/>
<motor name="rclaviclerz" joint="rclaviclerz" gear="20"/>
<motor name="rfemurrx" joint="rfemurrx" gear="120"/>
<motor name="rfemurry" joint="rfemurry" gear="40"/>
<motor name="rfemurrz" joint="rfemurrz" gear="40"/>
<motor name="rfingersrx" joint="rfingersrx" gear="20"/>
<motor name="rfootrx" joint="rfootrx" gear="20"/>
<motor name="rfootrz" joint="rfootrz" gear="20"/>
<motor name="rhandrx" joint="rhandrx" gear="20"/>
<motor name="rhandrz" joint="rhandrz" gear="20"/>
<motor name="rhumerusrx" joint="rhumerusrx" gear="40"/>
<motor name="rhumerusry" joint="rhumerusry" gear="40"/>
<motor name="rhumerusrz" joint="rhumerusrz" gear="40"/>
<motor name="rradiusrx" joint="rradiusrx" gear="40"/>
<motor name="rthumbrx" joint="rthumbrx" gear="20"/>
<motor name="rthumbrz" joint="rthumbrz" gear="20"/>
<motor name="rtibiarx" joint="rtibiarx" gear="80"/>
<motor name="rtoesrx" joint="rtoesrx" gear="20"/>
<motor name="rwristry" joint="rwristry" gear="20"/>
<motor name="thoraxrx" joint="thoraxrx" gear="40"/>
<motor name="thoraxry" joint="thoraxry" gear="40"/>
<motor name="thoraxrz" joint="thoraxrz" gear="40"/>
<motor name="upperbackrx" joint="upperbackrx" gear="40"/>
<motor name="upperbackry" joint="upperbackry" gear="40"/>
<motor name="upperbackrz" joint="upperbackrz" gear="40"/>
<motor name="upperneckrx" joint="upperneckrx" gear="20"/>
<motor name="upperneckry" joint="upperneckry" gear="20"/>
<motor name="upperneckrz" joint="upperneckrz" gear="20"/>
</actuator>
<sensor>
<subtreelinvel name="thorax_subtreelinvel" body="thorax"/>
<velocimeter name="sensor_root_veloc" site="root"/>
<gyro name="sensor_root_gyro" site="root"/>
<accelerometer name="sensor_root_accel" site="root"/>
<touch name="sensor_touch_ltoes" site="ltoes_touch"/>
<touch name="sensor_touch_rtoes" site="rtoes_touch"/>
<touch name="sensor_touch_rfoot" site="rfoot_touch"/>
<touch name="sensor_touch_lfoot" site="lfoot_touch"/>
</sensor>
</mujoco>

View File

@ -0,0 +1,272 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Procedurally generated LQR domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import xml_tools
from lxml import etree
import numpy as np
from six.moves import range
from dm_control.utils import io as resources
_DEFAULT_TIME_LIMIT = float('inf')
_CONTROL_COST_COEF = 0.1
SUITE = containers.TaggedTasks()
def get_model_and_assets(n_bodies, n_actuators, random):
"""Returns the model description as an XML string and a dict of assets.
Args:
n_bodies: An int, number of bodies of the LQR.
n_actuators: An int, number of actuated bodies of the LQR. `n_actuators`
must be less than or equal to `n_bodies`.
random: A `numpy.random.RandomState` instance.
Returns:
A tuple `(model_xml_string, assets)`, where `assets` is a dict consisting of
`{filename: contents_string}` pairs.
"""
return _make_model(n_bodies, n_actuators, random), common.ASSETS
@SUITE.add()
def lqr_2_1(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns an LQR environment with 2 bodies of which the first is actuated."""
return _make_lqr(n_bodies=2,
n_actuators=1,
control_cost_coef=_CONTROL_COST_COEF,
time_limit=time_limit,
random=random,
environment_kwargs=environment_kwargs)
@SUITE.add()
def lqr_6_2(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns an LQR environment with 6 bodies of which first 2 are actuated."""
return _make_lqr(n_bodies=6,
n_actuators=2,
control_cost_coef=_CONTROL_COST_COEF,
time_limit=time_limit,
random=random,
environment_kwargs=environment_kwargs)
def _make_lqr(n_bodies, n_actuators, control_cost_coef, time_limit, random,
environment_kwargs):
"""Returns a LQR environment.
Args:
n_bodies: An int, number of bodies of the LQR.
n_actuators: An int, number of actuated bodies of the LQR. `n_actuators`
must be less than or equal to `n_bodies`.
control_cost_coef: A number, the coefficient of the control cost.
time_limit: An int, maximum time for each episode in seconds.
random: Either an existing `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically.
environment_kwargs: A `dict` specifying keyword arguments for the
environment, or None.
Returns:
A LQR environment with `n_bodies` bodies of which first `n_actuators` are
actuated.
"""
if not isinstance(random, np.random.RandomState):
random = np.random.RandomState(random)
model_string, assets = get_model_and_assets(n_bodies, n_actuators,
random=random)
physics = Physics.from_xml_string(model_string, assets=assets)
task = LQRLevel(control_cost_coef, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(physics, task, time_limit=time_limit,
**environment_kwargs)
def _make_body(body_id, stiffness_range, damping_range, random):
"""Returns an `etree.Element` defining a body.
Args:
body_id: Id of the created body.
stiffness_range: A tuple of (stiffness_lower_bound, stiffness_upper_bound).
The stiffness of the joint is drawn uniformly from this range.
damping_range: A tuple of (damping_lower_bound, damping_upper_bound). The
damping of the joint is drawn uniformly from this range.
random: A `numpy.random.RandomState` instance.
Returns:
A new instance of `etree.Element`. A body element with two children: joint
and geom.
"""
body_name = 'body_{}'.format(body_id)
joint_name = 'joint_{}'.format(body_id)
geom_name = 'geom_{}'.format(body_id)
body = etree.Element('body', name=body_name)
body.set('pos', '.25 0 0')
joint = etree.SubElement(body, 'joint', name=joint_name)
body.append(etree.Element('geom', name=geom_name))
joint.set('stiffness',
str(random.uniform(stiffness_range[0], stiffness_range[1])))
joint.set('damping',
str(random.uniform(damping_range[0], damping_range[1])))
return body
def _make_model(n_bodies,
n_actuators,
random,
stiffness_range=(15, 25),
damping_range=(0, 0)):
"""Returns an MJCF XML string defining a model of springs and dampers.
Args:
n_bodies: An integer, the number of bodies (DoFs) in the system.
n_actuators: An integer, the number of actuated bodies.
random: A `numpy.random.RandomState` instance.
stiffness_range: A tuple containing minimum and maximum stiffness. Each
joint's stiffness is sampled uniformly from this interval.
damping_range: A tuple containing minimum and maximum damping. Each joint's
damping is sampled uniformly from this interval.
Returns:
An MJCF string describing the linear system.
Raises:
ValueError: If the number of bodies or actuators is erroneous.
"""
if n_bodies < 1 or n_actuators < 1:
raise ValueError('At least 1 body and 1 actuator required.')
if n_actuators > n_bodies:
raise ValueError('At most 1 actuator per body.')
file_path = os.path.join(os.path.dirname(__file__), 'lqr.xml')
with resources.GetResourceAsFile(file_path) as xml_file:
mjcf = xml_tools.parse(xml_file)
parent = mjcf.find('./worldbody')
actuator = etree.SubElement(mjcf.getroot(), 'actuator')
tendon = etree.SubElement(mjcf.getroot(), 'tendon')
for body in range(n_bodies):
# Inserting body.
child = _make_body(body, stiffness_range, damping_range, random)
site_name = 'site_{}'.format(body)
child.append(etree.Element('site', name=site_name))
if body == 0:
child.set('pos', '.25 0 .1')
# Add actuators to the first n_actuators bodies.
if body < n_actuators:
# Adding actuator.
joint_name = 'joint_{}'.format(body)
motor_name = 'motor_{}'.format(body)
child.find('joint').set('name', joint_name)
actuator.append(etree.Element('motor', name=motor_name, joint=joint_name))
# Add a tendon between consecutive bodies (for visualisation purposes only).
if body < n_bodies - 1:
child_site_name = 'site_{}'.format(body + 1)
tendon_name = 'tendon_{}'.format(body)
spatial = etree.SubElement(tendon, 'spatial', name=tendon_name)
spatial.append(etree.Element('site', site=site_name))
spatial.append(etree.Element('site', site=child_site_name))
parent.append(child)
parent = child
return etree.tostring(mjcf, pretty_print=True)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the LQR domain."""
def state_norm(self):
"""Returns the norm of the physics state."""
return np.linalg.norm(self.state())
class LQRLevel(base.Task):
"""A Linear Quadratic Regulator `Task`."""
_TERMINAL_TOL = 1e-6
def __init__(self, control_cost_coef, random=None):
"""Initializes an LQR level with cost = sum(states^2) + c*sum(controls^2).
Args:
control_cost_coef: The coefficient of the control cost.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
Raises:
ValueError: If the control cost coefficient is not positive.
"""
if control_cost_coef <= 0:
raise ValueError('control_cost_coef must be positive.')
self._control_cost_coef = control_cost_coef
super(LQRLevel, self).__init__(random=random)
@property
def control_cost_coef(self):
return self._control_cost_coef
def initialize_episode(self, physics):
"""Random state sampled from a unit sphere."""
ndof = physics.model.nq
unit = self.random.randn(ndof)
physics.data.qpos[:] = np.sqrt(2) * unit / np.linalg.norm(unit)
super(LQRLevel, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of the state."""
obs = collections.OrderedDict()
obs['position'] = physics.position()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a quadratic state and control reward."""
position = physics.position()
state_cost = 0.5 * np.dot(position, position)
control_signal = physics.control()
control_l2_norm = 0.5 * np.dot(control_signal, control_signal)
return 1 - (state_cost + control_l2_norm * self._control_cost_coef)
def get_evaluation(self, physics):
"""Returns a sparse evaluation reward that is not used for learning."""
return float(physics.state_norm() <= 0.01)
def get_termination(self, physics):
"""Terminates when the state norm is smaller than epsilon."""
if physics.state_norm() < self._TERMINAL_TOL:
return 0.0
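A brief sketch of using the registered levels above (illustrative only; it assumes this file is importable as local_dm_control_suite.lqr). The episode starts on a sphere of radius sqrt(2) in joint space, and with zero control the reward reduces to the quadratic state cost:
import numpy as np
from local_dm_control_suite import lqr  # assumed module path for this file

env = lqr.lqr_2_1(random=42)
time_step = env.reset()
zero_control = np.zeros(env.physics.model.nu)
time_step = env.step(zero_control)
# With zero control, reward = 1 - 0.5 * ||qpos||^2 for the post-step state.
print(time_step.reward, env.physics.state_norm())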

View File

@ -0,0 +1,26 @@
<mujoco model="LQR">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<option timestep=".03"/>
<default>
<joint type="slide" axis="0 1 0"/>
<geom type="sphere" size=".1" material="self"/>
<site size=".01"/>
<tendon width=".02" material="self"/>
</default>
<option>
<flag constraint="disable"/>
</option>
<worldbody>
<light name="light" pos="0 0 2"/>
<camera name="cam0" pos="-1.428 -0.311 0.856" xyaxes="0.099 -0.995 0.000 0.350 0.035 0.936"/>
<camera name="cam1" pos="1.787 2.452 4.331" xyaxes="-1 0 0 0 -0.868 0.497"/>
<geom name="floor" size="4 1 .2" type="plane" material="grid"/>
<geom name="origin" pos="2 0 .05" size="2 .003 .05" type="box" rgba=".5 .5 .5 .5"/>
</worldbody>
</mujoco>

View File

@ -0,0 +1,142 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
r"""Optimal policy for LQR levels.
The LQR control problem is described in
https://en.wikipedia.org/wiki/Linear-quadratic_regulator#Infinite-horizon.2C_discrete-time_LQR
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
from dm_control.mujoco import wrapper
import numpy as np
from six.moves import range
try:
import scipy.linalg as sp # pylint: disable=g-import-not-at-top
except ImportError:
sp = None
def _solve_dare(a, b, q, r):
"""Solves the Discrete-time Algebraic Riccati Equation (DARE) by iteration.
Algebraic Riccati Equation:
```none
P_{t-1} = Q + A' * P_{t} * A -
A' * P_{t} * B * (R + B' * P_{t} * B)^{-1} * B' * P_{t} * A
```
Args:
a: A 2 dimensional numpy array, transition matrix A.
b: A 2 dimensional numpy array, control matrix B.
q: A 2 dimensional numpy array, symmetric positive definite cost matrix.
r: A 2 dimensional numpy array, symmetric positive definite cost matrix.
Returns:
A numpy array, a real symmetric matrix P which is the solution to DARE.
Raises:
RuntimeError: If the computed P matrix is not symmetric and
positive-definite.
"""
p = np.eye(len(a))
for _ in range(1000000):
a_p = a.T.dot(p) # A' * P_t
a_p_b = np.dot(a_p, b) # A' * P_t * B
# Algebraic Riccati Equation.
p_next = q + np.dot(a_p, a) - a_p_b.dot(
np.linalg.solve(b.T.dot(p.dot(b)) + r, a_p_b.T))
p_next += p_next.T
p_next *= .5
if np.abs(p - p_next).max() < 1e-12:
break
p = p_next
else:
logging.warning('DARE solver did not converge')
try:
# Check that the result is symmetric and positive-definite.
np.linalg.cholesky(p_next)
except np.linalg.LinAlgError:
raise RuntimeError('ARE solver failed: P matrix is not symmetric and '
'positive-definite.')
return p_next
def solve(env):
"""Returns the optimal value and policy for LQR problem.
Args:
env: An instance of `control.EnvironmentV2` with LQR level.
Returns:
p: A numpy array, the Hessian of the optimal total cost-to-go (value
function at state x) is V(x) = .5 * x' * p * x.
k: A numpy array which gives the optimal linear policy u = k * x.
beta: The maximum eigenvalue of (a + b * k). Under optimal policy, at
timestep n the state tends to 0 like beta^n.
Raises:
RuntimeError: If the controlled system is unstable.
"""
n = env.physics.model.nq # number of DoFs
m = env.physics.model.nu # number of controls
# Compute the mass matrix.
mass = np.zeros((n, n))
wrapper.mjbindings.mjlib.mj_fullM(env.physics.model.ptr, mass,
env.physics.data.qM)
# Compute input matrices a, b, q and r to the DARE solvers.
# State transition matrix a.
stiffness = np.diag(env.physics.model.jnt_stiffness.ravel())
damping = np.diag(env.physics.model.dof_damping.ravel())
dt = env.physics.model.opt.timestep
j = np.linalg.solve(-mass, np.hstack((stiffness, damping)))
a = np.eye(2 * n) + dt * np.vstack(
(dt * j + np.hstack((np.zeros((n, n)), np.eye(n))), j))
# Control transition matrix b.
b = env.physics.data.actuator_moment.T
bc = np.linalg.solve(mass, b)
b = dt * np.vstack((dt * bc, bc))
# State cost Hessian q.
q = np.diag(np.hstack([np.ones(n), np.zeros(n)]))
# Control cost Hessian r.
r = env.task.control_cost_coef * np.eye(m)
if sp:
# Use scipy's faster DARE solver if available.
solve_dare = sp.solve_discrete_are
else:
# Otherwise fall back on a slower internal implementation.
solve_dare = _solve_dare
# Solve the discrete algebraic Riccati equation.
p = solve_dare(a, b, q, r)
k = -np.linalg.solve(b.T.dot(p.dot(b)) + r, b.T.dot(p.dot(a)))
# Under optimal policy, state tends to 0 like beta^n_timesteps
beta = np.abs(np.linalg.eigvals(a + b.dot(k))).max()
if beta >= 1.0:
raise RuntimeError('Controlled system is unstable.')
return p, k, beta
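Closing the loop with the solver above, a hedged sketch (module paths are assumptions based on this file layout): solve() yields the value Hessian p, the gain k and the contraction rate beta, and the optimal action is k applied to the concatenated position/velocity state:
import numpy as np
from local_dm_control_suite import lqr           # assumed module path
from local_dm_control_suite import lqr_solver    # assumed module path for this file

env = lqr.lqr_6_2(random=0)
p, k, beta = lqr_solver.solve(env)
time_step = env.reset()
for _ in range(200):
    x = np.hstack([env.physics.position(), env.physics.velocity()])
    time_step = env.step(k.dot(x))  # u = k * x is the optimal linear policy.
# The state norm should contract roughly like beta ** n towards zero.
print(env.physics.state_norm(), beta)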

View File

@ -0,0 +1,290 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Planar Manipulator domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
from dm_control.utils import xml_tools
from lxml import etree
import numpy as np
_CLOSE = .01 # (Meters) Distance below which a thing is considered close.
_CONTROL_TIMESTEP = .01 # (Seconds)
_TIME_LIMIT = 10 # (Seconds)
_P_IN_HAND = .1  # Probability of object-in-hand initial state
_P_IN_TARGET = .1  # Probability of object-in-target initial state
_ARM_JOINTS = ['arm_root', 'arm_shoulder', 'arm_elbow', 'arm_wrist',
'finger', 'fingertip', 'thumb', 'thumbtip']
_ALL_PROPS = frozenset(['ball', 'target_ball', 'cup',
'peg', 'target_peg', 'slot'])
SUITE = containers.TaggedTasks()
def make_model(use_peg, insert):
"""Returns a tuple containing the model XML string and a dict of assets."""
xml_string = common.read_model('manipulator.xml')
parser = etree.XMLParser(remove_blank_text=True)
mjcf = etree.XML(xml_string, parser)
# Select the desired prop.
if use_peg:
required_props = ['peg', 'target_peg']
if insert:
required_props += ['slot']
else:
required_props = ['ball', 'target_ball']
if insert:
required_props += ['cup']
# Remove unused props
for unused_prop in _ALL_PROPS.difference(required_props):
prop = xml_tools.find_element(mjcf, 'body', unused_prop)
prop.getparent().remove(prop)
return etree.tostring(mjcf, pretty_print=True), common.ASSETS
@SUITE.add('benchmarking', 'hard')
def bring_ball(fully_observable=True, time_limit=_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns manipulator bring task with the ball prop."""
use_peg = False
insert = False
physics = Physics.from_xml_string(*make_model(use_peg, insert))
task = Bring(use_peg=use_peg, insert=insert,
fully_observable=fully_observable, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
@SUITE.add('hard')
def bring_peg(fully_observable=True, time_limit=_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns manipulator bring task with the peg prop."""
use_peg = True
insert = False
physics = Physics.from_xml_string(*make_model(use_peg, insert))
task = Bring(use_peg=use_peg, insert=insert,
fully_observable=fully_observable, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
@SUITE.add('hard')
def insert_ball(fully_observable=True, time_limit=_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns manipulator insert task with the ball prop."""
use_peg = False
insert = True
physics = Physics.from_xml_string(*make_model(use_peg, insert))
task = Bring(use_peg=use_peg, insert=insert,
fully_observable=fully_observable, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
@SUITE.add('hard')
def insert_peg(fully_observable=True, time_limit=_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns manipulator insert task with the peg prop."""
use_peg = True
insert = True
physics = Physics.from_xml_string(*make_model(use_peg, insert))
task = Bring(use_peg=use_peg, insert=insert,
fully_observable=fully_observable, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics with additional features for the Planar Manipulator domain."""
def bounded_joint_pos(self, joint_names):
"""Returns joint positions as (sin, cos) values."""
joint_pos = self.named.data.qpos[joint_names]
return np.vstack([np.sin(joint_pos), np.cos(joint_pos)]).T
def joint_vel(self, joint_names):
"""Returns joint velocities."""
return self.named.data.qvel[joint_names]
def body_2d_pose(self, body_names, orientation=True):
"""Returns positions and/or orientations of bodies."""
if not isinstance(body_names, str):
body_names = np.array(body_names).reshape(-1, 1) # Broadcast indices.
pos = self.named.data.xpos[body_names, ['x', 'z']]
if orientation:
ori = self.named.data.xquat[body_names, ['qw', 'qy']]
return np.hstack([pos, ori])
else:
return pos
def touch(self):
return np.log1p(self.data.sensordata)
def site_distance(self, site1, site2):
site1_to_site2 = np.diff(self.named.data.site_xpos[[site2, site1]], axis=0)
return np.linalg.norm(site1_to_site2)
class Bring(base.Task):
"""A Bring `Task`: bring the prop to the target."""
def __init__(self, use_peg, insert, fully_observable, random=None):
"""Initialize an instance of the `Bring` task.
Args:
use_peg: A `bool`, whether to replace the ball prop with the peg prop.
insert: A `bool`, whether to insert the prop in a receptacle.
fully_observable: A `bool`, whether the observation should contain the
position and velocity of the object being manipulated and the target
location.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._use_peg = use_peg
self._target = 'target_peg' if use_peg else 'target_ball'
self._object = 'peg' if self._use_peg else 'ball'
self._object_joints = ['_'.join([self._object, dim]) for dim in 'xzy']
self._receptacle = 'slot' if self._use_peg else 'cup'
self._insert = insert
self._fully_observable = fully_observable
super(Bring, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode."""
# Local aliases
choice = self.random.choice
uniform = self.random.uniform
model = physics.named.model
data = physics.named.data
# Find a collision-free random initial configuration.
penetrating = True
while penetrating:
# Randomise angles of arm joints.
is_limited = model.jnt_limited[_ARM_JOINTS].astype(np.bool)
joint_range = model.jnt_range[_ARM_JOINTS]
lower_limits = np.where(is_limited, joint_range[:, 0], -np.pi)
upper_limits = np.where(is_limited, joint_range[:, 1], np.pi)
angles = uniform(lower_limits, upper_limits)
data.qpos[_ARM_JOINTS] = angles
# Symmetrize hand.
data.qpos['finger'] = data.qpos['thumb']
# Randomise target location.
target_x = uniform(-.4, .4)
target_z = uniform(.1, .4)
if self._insert:
target_angle = uniform(-np.pi/3, np.pi/3)
model.body_pos[self._receptacle, ['x', 'z']] = target_x, target_z
model.body_quat[self._receptacle, ['qw', 'qy']] = [
np.cos(target_angle/2), np.sin(target_angle/2)]
else:
target_angle = uniform(-np.pi, np.pi)
model.body_pos[self._target, ['x', 'z']] = target_x, target_z
model.body_quat[self._target, ['qw', 'qy']] = [
np.cos(target_angle/2), np.sin(target_angle/2)]
# Randomise object location.
object_init_probs = [_P_IN_HAND, _P_IN_TARGET, 1-_P_IN_HAND-_P_IN_TARGET]
init_type = choice(['in_hand', 'in_target', 'uniform'],
p=object_init_probs)
if init_type == 'in_target':
object_x = target_x
object_z = target_z
object_angle = target_angle
elif init_type == 'in_hand':
physics.after_reset()
object_x = data.site_xpos['grasp', 'x']
object_z = data.site_xpos['grasp', 'z']
grasp_direction = data.site_xmat['grasp', ['xx', 'zx']]
object_angle = np.pi-np.arctan2(grasp_direction[1], grasp_direction[0])
else:
object_x = uniform(-.5, .5)
object_z = uniform(0, .7)
object_angle = uniform(0, 2*np.pi)
data.qvel[self._object + '_x'] = uniform(-5, 5)
data.qpos[self._object_joints] = object_x, object_z, object_angle
# Check for collisions.
physics.after_reset()
penetrating = physics.data.ncon > 0
super(Bring, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns either features or only sensors (to be used with pixels)."""
obs = collections.OrderedDict()
obs['arm_pos'] = physics.bounded_joint_pos(_ARM_JOINTS)
obs['arm_vel'] = physics.joint_vel(_ARM_JOINTS)
obs['touch'] = physics.touch()
if self._fully_observable:
obs['hand_pos'] = physics.body_2d_pose('hand')
obs['object_pos'] = physics.body_2d_pose(self._object)
obs['object_vel'] = physics.joint_vel(self._object_joints)
obs['target_pos'] = physics.body_2d_pose(self._target)
return obs
def _is_close(self, distance):
return rewards.tolerance(distance, (0, _CLOSE), _CLOSE*2)
def _peg_reward(self, physics):
"""Returns a reward for bringing the peg prop to the target."""
grasp = self._is_close(physics.site_distance('peg_grasp', 'grasp'))
pinch = self._is_close(physics.site_distance('peg_pinch', 'pinch'))
grasping = (grasp + pinch) / 2
bring = self._is_close(physics.site_distance('peg', 'target_peg'))
bring_tip = self._is_close(physics.site_distance('target_peg_tip',
'peg_tip'))
bringing = (bring + bring_tip) / 2
return max(bringing, grasping/3)
def _ball_reward(self, physics):
"""Returns a reward for bringing the ball prop to the target."""
return self._is_close(physics.site_distance('ball', 'target_ball'))
def get_reward(self, physics):
"""Returns a reward to the agent."""
if self._use_peg:
return self._peg_reward(physics)
else:
return self._ball_reward(physics)
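# A minimal smoke-test sketch for the factories above, assuming dm_control
# and its MuJoCo dependencies are installed; each factory returns a
# `control.Environment` whose actions live in the actuator ctrlrange.
if __name__ == '__main__':
  _env = bring_ball()
  _spec = _env.action_spec()
  _timestep = _env.reset()
  for _ in range(10):
    _action = np.random.uniform(_spec.minimum, _spec.maximum, _spec.shape)
    _timestep = _env.step(_action)
  print(sorted(_timestep.observation.keys()), _timestep.reward)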

View File

@ -0,0 +1,211 @@
<mujoco model="planar manipulator">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<asset>
<texture name="background" builtin="flat" type="2d" mark="random" markrgb="1 1 1" width="800" height="800" rgb1=".2 .3 .4"/>
<material name="background" texture="background" texrepeat="1 1" texuniform="true"/>
</asset>
<visual>
<map shadowclip=".5"/>
<quality shadowsize="2048"/>
  </visual>
<option timestep="0.001" cone="elliptic"/>
<default>
<geom friction=".7" solimp="0.9 0.97 0.001" solref=".005 1"/>
<joint solimplimit="0 0.99 0.01" solreflimit=".005 1"/>
<general ctrllimited="true"/>
<tendon width="0.01"/>
<site size=".003 .003 .003" material="site" group="3"/>
<default class="arm">
<geom type="capsule" material="self" density="500"/>
<joint type="hinge" pos="0 0 0" axis="0 -1 0" limited="true"/>
<default class="hand">
<joint damping=".5" range="-10 60"/>
<geom size=".008"/>
<site type="box" size=".018 .005 .005" pos=".022 0 -.002" euler="0 15 0" group="4"/>
<default class="fingertip">
<geom type="sphere" size=".008" material="effector"/>
<joint damping=".01" stiffness=".01" range="-40 20"/>
<site size=".012 .005 .008" pos=".003 0 .003" group="4" euler="0 0 0"/>
</default>
</default>
</default>
<default class="object">
<geom material="self"/>
</default>
<default class="task">
<site rgba="0 0 0 0"/>
</default>
<default class="obstacle">
<geom material="decoration" friction="0"/>
</default>
<default class="ghost">
<geom material="target" contype="0" conaffinity="0"/>
</default>
</default>
<worldbody>
<!-- Arena -->
<light name="light" directional="true" diffuse=".6 .6 .6" pos="0 0 1" specular=".3 .3 .3"/>
<geom name="floor" type="plane" pos="0 0 0" size=".4 .2 10" material="grid"/>
<geom name="wall1" type="plane" pos="-.682843 0 .282843" size=".4 .2 10" material="grid" zaxis="1 0 1"/>
<geom name="wall2" type="plane" pos=".682843 0 .282843" size=".4 .2 10" material="grid" zaxis="-1 0 1"/>
<geom name="background" type="plane" pos="0 .2 .5" size="1 .5 10" material="background" zaxis="0 -1 0"/>
<camera name="fixed" pos="0 -16 .4" xyaxes="1 0 0 0 0 1" fovy="4"/>
<!-- Arm -->
<geom name="arm_root" type="cylinder" fromto="0 -.022 .4 0 .022 .4" size=".024"
material="decoration" contype="0" conaffinity="0"/>
<body name="upper_arm" pos="0 0 .4" childclass="arm">
<joint name="arm_root" damping="2" limited="false"/>
<geom name="upper_arm" size=".02" fromto="0 0 0 0 0 .18"/>
<body name="middle_arm" pos="0 0 .18" childclass="arm">
<joint name="arm_shoulder" damping="1.5" range="-160 160"/>
<geom name="middle_arm" size=".017" fromto="0 0 0 0 0 .15"/>
<body name="lower_arm" pos="0 0 .15">
<joint name="arm_elbow" damping="1" range="-160 160"/>
<geom name="lower_arm" size=".014" fromto="0 0 0 0 0 .12"/>
<body name="hand" pos="0 0 .12">
<joint name="arm_wrist" damping=".5" range="-140 140" />
<geom name="hand" size=".011" fromto="0 0 0 0 0 .03"/>
<geom name="palm1" fromto="0 0 .03 .03 0 .045" class="hand"/>
<geom name="palm2" fromto="0 0 .03 -.03 0 .045" class="hand"/>
<site name="grasp" pos="0 0 .065"/>
<body name="pinch site" pos="0 0 .090">
<site name="pinch"/>
<inertial pos="0 0 0" mass="1e-6" diaginertia="1e-12 1e-12 1e-12"/>
<camera name="hand" pos="0 -.3 0" xyaxes="1 0 0 0 0 1" mode="track"/>
</body>
<site name="palm_touch" type="box" group="4" size=".025 .005 .008" pos="0 0 .043"/>
<body name="thumb" pos=".03 0 .045" euler="0 -90 0" childclass="hand">
<joint name="thumb"/>
<geom name="thumb1" fromto="0 0 0 .02 0 -.01" size=".007"/>
<geom name="thumb2" fromto=".02 0 -.01 .04 0 -.01" size=".007"/>
<site name="thumb_touch" group="4"/>
<body name="thumbtip" pos=".05 0 -.01" childclass="fingertip">
<joint name="thumbtip"/>
<geom name="thumbtip1" pos="-.003 0 0" />
<geom name="thumbtip2" pos=".003 0 0" />
<site name="thumbtip_touch" group="4"/>
</body>
</body>
<body name="finger" pos="-.03 0 .045" euler="0 90 180" childclass="hand">
<joint name="finger"/>
<geom name="finger1" fromto="0 0 0 .02 0 -.01" size=".007" />
<geom name="finger2" fromto=".02 0 -.01 .04 0 -.01" size=".007"/>
<site name="finger_touch"/>
<body name="fingertip" pos=".05 0 -.01" childclass="fingertip">
<joint name="fingertip"/>
<geom name="fingertip1" pos="-.003 0 0" />
<geom name="fingertip2" pos=".003 0 0" />
<site name="fingertip_touch"/>
</body>
</body>
</body>
</body>
</body>
</body>
<!-- props -->
<body name="ball" pos=".4 0 .4" childclass="object">
<joint name="ball_x" type="slide" axis="1 0 0" ref=".4"/>
<joint name="ball_z" type="slide" axis="0 0 1" ref=".4"/>
<joint name="ball_y" type="hinge" axis="0 1 0"/>
<geom name="ball" type="sphere" size=".022" />
<site name="ball" type="sphere"/>
</body>
<body name="peg" pos="-.4 0 .4" childclass="object">
<joint name="peg_x" type="slide" axis="1 0 0" ref="-.4"/>
<joint name="peg_z" type="slide" axis="0 0 1" ref=".4"/>
<joint name="peg_y" type="hinge" axis="0 1 0"/>
<geom name="blade" type="capsule" size=".005" fromto="0 0 -.013 0 0 -.113"/>
<geom name="guard" type="capsule" size=".005" fromto="-.017 0 -.043 .017 0 -.043"/>
<body name="pommel" pos="0 0 -.013">
<geom name="pommel" type="sphere" size=".009"/>
</body>
<site name="peg" type="box" pos="0 0 -.063"/>
<site name="peg_pinch" type="box" pos="0 0 -.025"/>
<site name="peg_grasp" type="box" pos="0 0 0"/>
<site name="peg_tip" type="box" pos="0 0 -.113"/>
</body>
<!-- receptacles -->
<body name="slot" pos="-.405 0 .2" euler="0 20 0" childclass="obstacle">
<geom name="slot_0" type="box" pos="-.0252 0 -.083" size=".0198 .01 .035"/>
<geom name="slot_1" type="box" pos=" .0252 0 -.083" size=".0198 .01 .035"/>
<geom name="slot_2" type="box" pos=" 0 0 -.138" size=".045 .01 .02"/>
<site name="slot" type="box" pos="0 0 0"/>
<site name="slot_end" type="box" pos="0 0 -.05"/>
</body>
<body name="cup" pos=".3 0 .4" euler="0 -15 0" childclass="obstacle">
<geom name="cup_0" type="capsule" size=".008" fromto="-.03 0 .06 -.03 0 -.015" />
<geom name="cup_1" type="capsule" size=".008" fromto="-.03 0 -.015 0 0 -.04" />
<geom name="cup_2" type="capsule" size=".008" fromto="0 0 -.04 .03 0 -.015" />
<geom name="cup_3" type="capsule" size=".008" fromto=".03 0 -.015 .03 0 .06" />
<site name="cup" size=".005"/>
</body>
<!-- targets -->
<body name="target_ball" pos=".4 .001 .4" childclass="ghost">
<geom name="target_ball" type="sphere" size=".02" />
<site name="target_ball" type="sphere"/>
</body>
<body name="target_peg" pos="-.2 .001 .4" childclass="ghost">
<geom name="target_blade" type="capsule" size=".005" fromto="0 0 -.013 0 0 -.113"/>
<geom name="target_guard" type="capsule" size=".005" fromto="-.017 0 -.043 .017 0 -.043"/>
<geom name="target_pommel" type="sphere" size=".009" pos="0 0 -.013"/>
<site name="target_peg" type="box" pos="0 0 -.063"/>
<site name="target_peg_pinch" type="box" pos="0 0 -.025"/>
<site name="target_peg_grasp" type="box" pos="0 0 0"/>
<site name="target_peg_tip" type="box" pos="0 0 -.113"/>
</body>
</worldbody>
<tendon>
<fixed name="grasp">
<joint joint="thumb" coef=".5"/>
<joint joint="finger" coef=".5"/>
</fixed>
<fixed name="coupling">
<joint joint="thumb" coef="-.5"/>
<joint joint="finger" coef=".5"/>
</fixed>
</tendon>
<equality>
<tendon name="coupling" tendon1="coupling" solimp="0.95 0.99 0.001" solref=".005 .5"/>
</equality>
<sensor>
<touch name="palm_touch" site="palm_touch"/>
<touch name="finger_touch" site="finger_touch"/>
<touch name="thumb_touch" site="thumb_touch"/>
<touch name="fingertip_touch" site="fingertip_touch"/>
<touch name="thumbtip_touch" site="thumbtip_touch"/>
</sensor>
<actuator>
<motor name="root" joint="arm_root" ctrlrange="-1 1" gear="12"/>
<motor name="shoulder" joint="arm_shoulder" ctrlrange="-1 1" gear="8"/>
<motor name="elbow" joint="arm_elbow" ctrlrange="-1 1" gear="4"/>
<motor name="wrist" joint="arm_wrist" ctrlrange="-1 1" gear="2"/>
<motor name="grasp" tendon="grasp" ctrlrange="-1 1" gear="2"/>
</actuator>
</mujoco>

View File

@ -0,0 +1,114 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Pendulum domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
_DEFAULT_TIME_LIMIT = 20
_ANGLE_BOUND = 8
_COSINE_BOUND = np.cos(np.deg2rad(_ANGLE_BOUND))
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('pendulum.xml'), common.ASSETS
@SUITE.add('benchmarking')
def swingup(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns pendulum swingup task ."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = SwingUp(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Pendulum domain."""
def pole_vertical(self):
"""Returns vertical (z) component of pole frame."""
return self.named.data.xmat['pole', 'zz']
def angular_velocity(self):
"""Returns the angular velocity of the pole."""
return self.named.data.qvel['hinge'].copy()
def pole_orientation(self):
"""Returns both horizontal and vertical components of pole frame."""
return self.named.data.xmat['pole', ['zz', 'xz']]
class SwingUp(base.Task):
"""A Pendulum `Task` to swing up and balance the pole."""
def __init__(self, random=None):
"""Initialize an instance of `Pendulum`.
Args:
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
super(SwingUp, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Pole is set to a random angle between [-pi, pi).
Args:
physics: An instance of `Physics`.
"""
physics.named.data.qpos['hinge'] = self.random.uniform(-np.pi, np.pi)
super(SwingUp, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation.
    Observations are states concatenating pole orientation and angular velocity.
    Args:
      physics: An instance of `Physics`, Pendulum physics.
    Returns:
      A `dict` of observations.
"""
obs = collections.OrderedDict()
obs['orientation'] = physics.pole_orientation()
obs['velocity'] = physics.angular_velocity()
return obs
def get_reward(self, physics):
return rewards.tolerance(physics.pole_vertical(), (_COSINE_BOUND, 1))
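# A small sketch of what the reward above computes: with no `margin`,
# `rewards.tolerance` acts as an indicator, so the reward is 1 exactly when
# the pole's zz component (the cosine of the pole angle) lies within
# [_COSINE_BOUND, 1], i.e. within _ANGLE_BOUND degrees of vertical.
if __name__ == '__main__':
  for _angle_deg in (0., 5., 8., 30., 180.):
    _reward = float(np.cos(np.deg2rad(_angle_deg)) >= _COSINE_BOUND)
    print('pole angle %5.1f deg -> reward %.0f' % (_angle_deg, _reward))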

View File

@ -0,0 +1,26 @@
<mujoco model="pendulum">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.02">
<flag contact="disable" energy="enable"/>
</option>
<worldbody>
<light name="light" pos="0 0 2"/>
<geom name="floor" size="2 2 .2" type="plane" material="grid"/>
<camera name="fixed" pos="0 -1.5 2" xyaxes='1 0 0 0 1 1'/>
<camera name="lookat" mode="targetbodycom" target="pole" pos="0 -2 1"/>
<body name="pole" pos="0 0 .6">
<joint name="hinge" type="hinge" axis="0 1 0" damping="0.1"/>
<geom name="base" material="decoration" type="cylinder" fromto="0 -.03 0 0 .03 0" size="0.021" mass="0"/>
<geom name="pole" material="self" type="capsule" fromto="0 0 0 0 0 0.5" size="0.02" mass="0"/>
<geom name="mass" material="effector" type="sphere" pos="0 0 0.5" size="0.05" mass="1"/>
</body>
</worldbody>
<actuator>
<motor name="torque" joint="hinge" gear="1" ctrlrange="-1 1" ctrllimited="true"/>
</actuator>
</mujoco>

View File

@ -0,0 +1,130 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Point-mass domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
_DEFAULT_TIME_LIMIT = 20
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('point_mass.xml'), common.ASSETS
@SUITE.add('benchmarking', 'easy')
def easy(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the easy point_mass task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = PointMass(randomize_gains=False, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add()
def hard(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the hard point_mass task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = PointMass(randomize_gains=True, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
class Physics(mujoco.Physics):
"""physics for the point_mass domain."""
def mass_to_target(self):
"""Returns the vector from mass to target in global coordinate."""
return (self.named.data.geom_xpos['target'] -
self.named.data.geom_xpos['pointmass'])
def mass_to_target_dist(self):
"""Returns the distance from mass to the target."""
return np.linalg.norm(self.mass_to_target())
class PointMass(base.Task):
"""A point_mass `Task` to reach target with smooth reward."""
def __init__(self, randomize_gains, random=None):
"""Initialize an instance of `PointMass`.
Args:
randomize_gains: A `bool`, whether to randomize the actuator gains.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._randomize_gains = randomize_gains
super(PointMass, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
If _randomize_gains is True, the relationship between the controls and
the joints is randomized, so that each control actuates a random linear
combination of joints.
Args:
physics: An instance of `mujoco.Physics`.
"""
randomizers.randomize_limited_and_rotational_joints(physics, self.random)
if self._randomize_gains:
dir1 = self.random.randn(2)
dir1 /= np.linalg.norm(dir1)
# Find another actuation direction that is not 'too parallel' to dir1.
parallel = True
while parallel:
dir2 = self.random.randn(2)
dir2 /= np.linalg.norm(dir2)
parallel = abs(np.dot(dir1, dir2)) > 0.9
physics.model.wrap_prm[[0, 1]] = dir1
physics.model.wrap_prm[[2, 3]] = dir2
super(PointMass, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of the state."""
obs = collections.OrderedDict()
obs['position'] = physics.position()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
target_size = physics.named.model.geom_size['target', 0]
near_target = rewards.tolerance(physics.mass_to_target_dist(),
bounds=(0, target_size), margin=target_size)
control_reward = rewards.tolerance(physics.control(), margin=1,
value_at_margin=0,
sigmoid='quadratic').mean()
small_control = (control_reward + 4) / 5
return near_target * small_control
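# A standalone sketch of the gain randomization used by the `hard` task:
# each motor ends up driving a random linear combination of the x and y
# joints via `wrap_prm`, with the two directions resampled until they are
# not "too parallel".
if __name__ == '__main__':
  _rng = np.random.RandomState(0)
  _dir1 = _rng.randn(2)
  _dir1 /= np.linalg.norm(_dir1)
  _dir2 = _dir1  # Start parallel so the loop body runs at least once.
  while abs(np.dot(_dir1, _dir2)) > 0.9:
    _dir2 = _rng.randn(2)
    _dir2 /= np.linalg.norm(_dir2)
  print('dir1:', _dir1, 'dir2:', _dir2, 'cosine:', abs(np.dot(_dir1, _dir2)))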

View File

@ -0,0 +1,49 @@
<mujoco model="planar point mass">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.02">
<flag contact="disable"/>
</option>
<default>
<joint type="hinge" axis="0 0 1" limited="true" range="-.29 .29" damping="1"/>
<motor gear=".1" ctrlrange="-1 1" ctrllimited="true"/>
</default>
<worldbody>
<light name="light" pos="0 0 1"/>
<camera name="fixed" pos="0 0 .75" quat="1 0 0 0"/>
<geom name="ground" type="plane" pos="0 0 0" size=".3 .3 .1" material="grid"/>
<geom name="wall_x" type="plane" pos="-.3 0 .02" zaxis="1 0 0" size=".02 .3 .02" material="decoration"/>
<geom name="wall_y" type="plane" pos="0 -.3 .02" zaxis="0 1 0" size=".3 .02 .02" material="decoration"/>
<geom name="wall_neg_x" type="plane" pos=".3 0 .02" zaxis="-1 0 0" size=".02 .3 .02" material="decoration"/>
<geom name="wall_neg_y" type="plane" pos="0 .3 .02" zaxis="0 -1 0" size=".3 .02 .02" material="decoration"/>
<body name="pointmass" pos="0 0 .01">
<camera name="cam0" pos="0 -0.3 0.3" xyaxes="1 0 0 0 0.7 0.7"/>
<joint name="root_x" type="slide" pos="0 0 0" axis="1 0 0" />
<joint name="root_y" type="slide" pos="0 0 0" axis="0 1 0" />
<geom name="pointmass" type="sphere" size=".01" material="self" mass=".3"/>
</body>
<geom name="target" pos="0 0 .01" material="target" type="sphere" size=".015"/>
</worldbody>
<tendon>
<fixed name="t1">
<joint joint="root_x" coef="1"/>
<joint joint="root_y" coef="0"/>
</fixed>
<fixed name="t2">
<joint joint="root_x" coef="0"/>
<joint joint="root_y" coef="1"/>
</fixed>
</tendon>
<actuator>
<motor name="t1" tendon="t1"/>
<motor name="t2" tendon="t2"/>
</actuator>
</mujoco>

View File

@ -0,0 +1,480 @@
# Copyright 2019 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Quadruped Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.mujoco.wrapper import mjbindings
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
from dm_control.utils import xml_tools
from lxml import etree
import numpy as np
from scipy import ndimage
enums = mjbindings.enums
mjlib = mjbindings.mjlib
_DEFAULT_TIME_LIMIT = 20
_CONTROL_TIMESTEP = .02
# Horizontal speeds above which the move reward is 1.
_RUN_SPEED = 5
_WALK_SPEED = 0.5
# Constants related to terrain generation.
_HEIGHTFIELD_ID = 0
_TERRAIN_SMOOTHNESS = 0.15 # 0.0: maximally bumpy; 1.0: completely smooth.
_TERRAIN_BUMP_SCALE = 2 # Spatial scale of terrain bumps (in meters).
# Named model elements.
_TOES = ['toe_front_left', 'toe_back_left', 'toe_back_right', 'toe_front_right']
_WALLS = ['wall_px', 'wall_py', 'wall_nx', 'wall_ny']
SUITE = containers.TaggedTasks()
def make_model(floor_size=None, terrain=False, rangefinders=False,
walls_and_ball=False):
"""Returns the model XML string."""
xml_string = common.read_model('quadruped.xml')
parser = etree.XMLParser(remove_blank_text=True)
mjcf = etree.XML(xml_string, parser)
# Set floor size.
if floor_size is not None:
floor_geom = mjcf.find('.//geom[@name={!r}]'.format('floor'))
floor_geom.attrib['size'] = '{} {} .5'.format(floor_size, floor_size)
# Remove walls, ball and target.
if not walls_and_ball:
for wall in _WALLS:
wall_geom = xml_tools.find_element(mjcf, 'geom', wall)
wall_geom.getparent().remove(wall_geom)
# Remove ball.
ball_body = xml_tools.find_element(mjcf, 'body', 'ball')
ball_body.getparent().remove(ball_body)
# Remove target.
target_site = xml_tools.find_element(mjcf, 'site', 'target')
target_site.getparent().remove(target_site)
# Remove terrain.
if not terrain:
terrain_geom = xml_tools.find_element(mjcf, 'geom', 'terrain')
terrain_geom.getparent().remove(terrain_geom)
# Remove rangefinders if they're not used, as range computations can be
# expensive, especially in a scene with heightfields.
if not rangefinders:
rangefinder_sensors = mjcf.findall('.//rangefinder')
for rf in rangefinder_sensors:
rf.getparent().remove(rf)
return etree.tostring(mjcf, pretty_print=True)
@SUITE.add()
def walk(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Walk task."""
xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _WALK_SPEED)
physics = Physics.from_xml_string(xml_string, common.ASSETS)
task = Move(desired_speed=_WALK_SPEED, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(physics, task, time_limit=time_limit,
control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add()
def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Run task."""
xml_string = make_model(floor_size=_DEFAULT_TIME_LIMIT * _RUN_SPEED)
physics = Physics.from_xml_string(xml_string, common.ASSETS)
task = Move(desired_speed=_RUN_SPEED, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(physics, task, time_limit=time_limit,
control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add()
def escape(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns the Escape task."""
xml_string = make_model(floor_size=40, terrain=True, rangefinders=True)
physics = Physics.from_xml_string(xml_string, common.ASSETS)
task = Escape(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(physics, task, time_limit=time_limit,
control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add()
def fetch(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Fetch task."""
xml_string = make_model(walls_and_ball=True)
physics = Physics.from_xml_string(xml_string, common.ASSETS)
task = Fetch(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(physics, task, time_limit=time_limit,
control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Quadruped domain."""
def _reload_from_data(self, data):
super(Physics, self)._reload_from_data(data)
# Clear cached sensor names when the physics is reloaded.
self._sensor_types_to_names = {}
self._hinge_names = []
def _get_sensor_names(self, *sensor_types):
try:
sensor_names = self._sensor_types_to_names[sensor_types]
except KeyError:
[sensor_ids] = np.where(np.in1d(self.model.sensor_type, sensor_types))
sensor_names = [self.model.id2name(s_id, 'sensor') for s_id in sensor_ids]
self._sensor_types_to_names[sensor_types] = sensor_names
return sensor_names
def torso_upright(self):
"""Returns the dot-product of the torso z-axis and the global z-axis."""
return np.asarray(self.named.data.xmat['torso', 'zz'])
def torso_velocity(self):
"""Returns the velocity of the torso, in the local frame."""
return self.named.data.sensordata['velocimeter'].copy()
def egocentric_state(self):
"""Returns the state without global orientation or position."""
if not self._hinge_names:
[hinge_ids] = np.nonzero(self.model.jnt_type ==
enums.mjtJoint.mjJNT_HINGE)
self._hinge_names = [self.model.id2name(j_id, 'joint')
for j_id in hinge_ids]
return np.hstack((self.named.data.qpos[self._hinge_names],
self.named.data.qvel[self._hinge_names],
self.data.act))
def toe_positions(self):
"""Returns toe positions in egocentric frame."""
torso_frame = self.named.data.xmat['torso'].reshape(3, 3)
torso_pos = self.named.data.xpos['torso']
torso_to_toe = self.named.data.xpos[_TOES] - torso_pos
return torso_to_toe.dot(torso_frame)
def force_torque(self):
"""Returns scaled force/torque sensor readings at the toes."""
force_torque_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_FORCE,
enums.mjtSensor.mjSENS_TORQUE)
return np.arcsinh(self.named.data.sensordata[force_torque_sensors])
def imu(self):
"""Returns IMU-like sensor readings."""
imu_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_GYRO,
enums.mjtSensor.mjSENS_ACCELEROMETER)
return self.named.data.sensordata[imu_sensors]
def rangefinder(self):
"""Returns scaled rangefinder sensor readings."""
rf_sensors = self._get_sensor_names(enums.mjtSensor.mjSENS_RANGEFINDER)
rf_readings = self.named.data.sensordata[rf_sensors]
no_intersection = -1.0
return np.where(rf_readings == no_intersection, 1.0, np.tanh(rf_readings))
def origin_distance(self):
"""Returns the distance from the origin to the workspace."""
return np.asarray(np.linalg.norm(self.named.data.site_xpos['workspace']))
def origin(self):
"""Returns origin position in the torso frame."""
torso_frame = self.named.data.xmat['torso'].reshape(3, 3)
torso_pos = self.named.data.xpos['torso']
return -torso_pos.dot(torso_frame)
def ball_state(self):
"""Returns ball position and velocity relative to the torso frame."""
data = self.named.data
torso_frame = data.xmat['torso'].reshape(3, 3)
ball_rel_pos = data.xpos['ball'] - data.xpos['torso']
ball_rel_vel = data.qvel['ball_root'][:3] - data.qvel['root'][:3]
ball_rot_vel = data.qvel['ball_root'][3:]
ball_state = np.vstack((ball_rel_pos, ball_rel_vel, ball_rot_vel))
return ball_state.dot(torso_frame).ravel()
def target_position(self):
"""Returns target position in torso frame."""
torso_frame = self.named.data.xmat['torso'].reshape(3, 3)
torso_pos = self.named.data.xpos['torso']
torso_to_target = self.named.data.site_xpos['target'] - torso_pos
return torso_to_target.dot(torso_frame)
def ball_to_target_distance(self):
"""Returns horizontal distance from the ball to the target."""
ball_to_target = (self.named.data.site_xpos['target'] -
self.named.data.xpos['ball'])
return np.linalg.norm(ball_to_target[:2])
def self_to_ball_distance(self):
"""Returns horizontal distance from the quadruped workspace to the ball."""
    self_to_ball = (self.named.data.site_xpos['workspace'] -
                    self.named.data.xpos['ball'])
return np.linalg.norm(self_to_ball[:2])
def _find_non_contacting_height(physics, orientation, x_pos=0.0, y_pos=0.0):
"""Find a height with no contacts given a body orientation.
Args:
physics: An instance of `Physics`.
orientation: A quaternion.
x_pos: A float. Position along global x-axis.
y_pos: A float. Position along global y-axis.
Raises:
RuntimeError: If a non-contacting configuration has not been found after
10,000 attempts.
"""
z_pos = 0.0 # Start embedded in the floor.
num_contacts = 1
num_attempts = 0
# Move up in 1cm increments until no contacts.
while num_contacts > 0:
try:
with physics.reset_context():
physics.named.data.qpos['root'][:3] = x_pos, y_pos, z_pos
physics.named.data.qpos['root'][3:] = orientation
except control.PhysicsError:
# We may encounter a PhysicsError here due to filling the contact
# buffer, in which case we simply increment the height and continue.
pass
num_contacts = physics.data.ncon
z_pos += 0.01
num_attempts += 1
if num_attempts > 10000:
raise RuntimeError('Failed to find a non-contacting configuration.')
def _common_observations(physics):
"""Returns the observations common to all tasks."""
obs = collections.OrderedDict()
obs['egocentric_state'] = physics.egocentric_state()
obs['torso_velocity'] = physics.torso_velocity()
obs['torso_upright'] = physics.torso_upright()
obs['imu'] = physics.imu()
obs['force_torque'] = physics.force_torque()
return obs
def _upright_reward(physics, deviation_angle=0):
"""Returns a reward proportional to how upright the torso is.
Args:
physics: an instance of `Physics`.
deviation_angle: A float, in degrees. The reward is 0 when the torso is
exactly upside-down and 1 when the torso's z-axis is less than
`deviation_angle` away from the global z-axis.
"""
deviation = np.cos(np.deg2rad(deviation_angle))
return rewards.tolerance(
physics.torso_upright(),
bounds=(deviation, float('inf')),
sigmoid='linear',
margin=1 + deviation,
value_at_margin=0)
class Move(base.Task):
"""A quadruped task solved by moving forward at a designated speed."""
def __init__(self, desired_speed, random=None):
"""Initializes an instance of `Move`.
Args:
desired_speed: A float. If this value is zero, reward is given simply
for standing upright. Otherwise this specifies the horizontal velocity
at which the velocity-dependent reward component is maximized.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._desired_speed = desired_speed
super(Move, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Args:
physics: An instance of `Physics`.
"""
# Initial configuration.
orientation = self.random.randn(4)
orientation /= np.linalg.norm(orientation)
_find_non_contacting_height(physics, orientation)
super(Move, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation to the agent."""
return _common_observations(physics)
def get_reward(self, physics):
"""Returns a reward to the agent."""
# Move reward term.
move_reward = rewards.tolerance(
physics.torso_velocity()[0],
bounds=(self._desired_speed, float('inf')),
margin=self._desired_speed,
value_at_margin=0.5,
sigmoid='linear')
return _upright_reward(physics) * move_reward
class Escape(base.Task):
"""A quadruped task solved by escaping a bowl-shaped terrain."""
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Args:
physics: An instance of `Physics`.
"""
# Get heightfield resolution, assert that it is square.
res = physics.model.hfield_nrow[_HEIGHTFIELD_ID]
assert res == physics.model.hfield_ncol[_HEIGHTFIELD_ID]
# Sinusoidal bowl shape.
row_grid, col_grid = np.ogrid[-1:1:res*1j, -1:1:res*1j]
radius = np.clip(np.sqrt(col_grid**2 + row_grid**2), .04, 1)
bowl_shape = .5 - np.cos(2*np.pi*radius)/2
# Random smooth bumps.
terrain_size = 2 * physics.model.hfield_size[_HEIGHTFIELD_ID, 0]
bump_res = int(terrain_size / _TERRAIN_BUMP_SCALE)
bumps = self.random.uniform(_TERRAIN_SMOOTHNESS, 1, (bump_res, bump_res))
smooth_bumps = ndimage.zoom(bumps, res / float(bump_res))
# Terrain is elementwise product.
terrain = bowl_shape * smooth_bumps
start_idx = physics.model.hfield_adr[_HEIGHTFIELD_ID]
physics.model.hfield_data[start_idx:start_idx+res**2] = terrain.ravel()
super(Escape, self).initialize_episode(physics)
# If we have a rendering context, we need to re-upload the modified
# heightfield data.
if physics.contexts:
with physics.contexts.gl.make_current() as ctx:
ctx.call(mjlib.mjr_uploadHField,
physics.model.ptr,
physics.contexts.mujoco.ptr,
_HEIGHTFIELD_ID)
# Initial configuration.
orientation = self.random.randn(4)
orientation /= np.linalg.norm(orientation)
_find_non_contacting_height(physics, orientation)
def get_observation(self, physics):
"""Returns an observation to the agent."""
obs = _common_observations(physics)
obs['origin'] = physics.origin()
obs['rangefinder'] = physics.rangefinder()
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
# Escape reward term.
terrain_size = physics.model.hfield_size[_HEIGHTFIELD_ID, 0]
escape_reward = rewards.tolerance(
physics.origin_distance(),
bounds=(terrain_size, float('inf')),
margin=terrain_size,
value_at_margin=0,
sigmoid='linear')
return _upright_reward(physics, deviation_angle=20) * escape_reward
class Fetch(base.Task):
"""A quadruped task solved by bringing a ball to the origin."""
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Args:
physics: An instance of `Physics`.
"""
# Initial configuration, random azimuth and horizontal position.
azimuth = self.random.uniform(0, 2*np.pi)
orientation = np.array((np.cos(azimuth/2), 0, 0, np.sin(azimuth/2)))
spawn_radius = 0.9 * physics.named.model.geom_size['floor', 0]
x_pos, y_pos = self.random.uniform(-spawn_radius, spawn_radius, size=(2,))
_find_non_contacting_height(physics, orientation, x_pos, y_pos)
# Initial ball state.
physics.named.data.qpos['ball_root'][:2] = self.random.uniform(
-spawn_radius, spawn_radius, size=(2,))
physics.named.data.qpos['ball_root'][2] = 2
physics.named.data.qvel['ball_root'][:2] = 5*self.random.randn(2)
super(Fetch, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation to the agent."""
obs = _common_observations(physics)
obs['ball_state'] = physics.ball_state()
obs['target_position'] = physics.target_position()
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
# Reward for moving close to the ball.
arena_radius = physics.named.model.geom_size['floor', 0] * np.sqrt(2)
workspace_radius = physics.named.model.site_size['workspace', 0]
ball_radius = physics.named.model.geom_size['ball', 0]
reach_reward = rewards.tolerance(
physics.self_to_ball_distance(),
bounds=(0, workspace_radius+ball_radius),
sigmoid='linear',
margin=arena_radius, value_at_margin=0)
# Reward for bringing the ball to the target.
target_radius = physics.named.model.site_size['target', 0]
fetch_reward = rewards.tolerance(
physics.ball_to_target_distance(),
bounds=(0, target_radius),
sigmoid='linear',
margin=arena_radius, value_at_margin=0)
reach_then_fetch = reach_reward * (0.5 + 0.5*fetch_reward)
return _upright_reward(physics) * reach_then_fetch
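# A standalone sketch of the Escape terrain computed above: an elementwise
# product of a sinusoidal bowl and smoothed random bumps, upsampled to the
# heightfield resolution with `ndimage.zoom`. The resolution and half-extent
# below are assumed from the hfield definition in quadruped.xml.
if __name__ == '__main__':
  _res = 201                    # hfield nrow == ncol.
  _half_extent = 30.0           # hfield size[0], in meters.
  _row, _col = np.ogrid[-1:1:_res * 1j, -1:1:_res * 1j]
  _radius = np.clip(np.sqrt(_col**2 + _row**2), .04, 1)
  _bowl = .5 - np.cos(2 * np.pi * _radius) / 2
  _bump_res = int(2 * _half_extent / _TERRAIN_BUMP_SCALE)
  _bumps = np.random.uniform(_TERRAIN_SMOOTHNESS, 1, (_bump_res, _bump_res))
  _terrain = _bowl * ndimage.zoom(_bumps, _res / float(_bump_res))
  print(_terrain.shape, float(_terrain.min()), float(_terrain.max()))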

View File

@ -0,0 +1,329 @@
<mujoco model="quadruped">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<visual>
<rgba rangefinder="1 1 0.1 0.1"/>
<map znear=".005" zfar="20"/>
</visual>
<asset>
<hfield name="terrain" ncol="201" nrow="201" size="30 30 5 .1"/>
</asset>
<option timestep=".005"/>
<default>
<geom solimp=".9 .99 .003" solref=".01 1"/>
<default class="body">
<geom type="capsule" size=".08" condim="1" material="self" density="500"/>
<joint type="hinge" damping="30" armature=".01"
limited="true" solimplimit="0 .99 .01"/>
<default class="hip">
<default class="yaw">
<joint axis="0 0 1" range="-50 50"/>
</default>
<default class="pitch">
<joint axis="0 1 0" range="-20 60"/>
</default>
<geom fromto="0 0 0 .3 0 .11"/>
</default>
<default class="knee">
<joint axis="0 1 0" range="-60 50"/>
<geom size=".065" fromto="0 0 0 .25 0 -.25"/>
</default>
<default class="ankle">
<joint axis="0 1 0" range="-45 55"/>
<geom size=".055" fromto="0 0 0 0 0 -.25"/>
</default>
<default class="toe">
<geom type="sphere" size=".08" material="effector" friction="1.5"/>
<site type="sphere" size=".084" material="site" group="4"/>
</default>
</default>
<default class="rangefinder">
<site type="capsule" size=".005 .1" material="site" group="4"/>
</default>
<default class="wall">
<geom type="plane" material="decoration"/>
</default>
<default class="coupling">
<equality solimp="0.95 0.99 0.01" solref=".005 .5"/>
</default>
<general ctrllimited="true" gainprm="1000" biasprm="0 -1000" biastype="affine" dyntype="filter" dynprm=".1"/>
<default class="yaw_act">
<general ctrlrange="-1 1"/>
</default>
<default class="lift_act">
<general ctrlrange="-1 1.1"/>
</default>
<default class="extend_act">
<general ctrlrange="-.8 .8"/>
</default>
</default>
<asset>
<texture name="ball" builtin="checker" mark="cross" width="151" height="151"
rgb1="0.1 0.1 0.1" rgb2="0.9 0.9 0.9" markrgb="1 1 1"/>
<material name="ball" texture="ball" />
</asset>
<worldbody>
<geom name="floor" type="plane" size="15 15 .5" material="grid"/>
<geom name="wall_px" class="wall" pos="-15.7 0 .7" zaxis="1 0 1" size="1 15 .5"/>
<geom name="wall_py" class="wall" pos="0 -15.7 .7" zaxis="0 1 1" size="15 1 .5"/>
<geom name="wall_nx" class="wall" pos="15.7 0 .7" zaxis="-1 0 1" size="1 15 .5"/>
<geom name="wall_ny" class="wall" pos="0 15.7 .7" zaxis="0 -1 1" size="15 1 .5"/>
<site name="target" type="cylinder" size=".4 .06" pos="0 0 .05" material="target"/>
<geom name="terrain" type="hfield" hfield="terrain" rgba=".2 .3 .4 1" pos="0 0 -.01"/>
<camera name="global" pos="-10 10 10" xyaxes="-1 -1 0 1 0 1" mode="trackcom"/>
<body name="torso" childclass="body" pos="0 0 .57">
<freejoint name="root"/>
<camera name="x" pos="-1.7 0 1" xyaxes="0 -1 0 .75 0 1" mode="trackcom"/>
<camera name="y" pos="0 4 2" xyaxes="-1 0 0 0 -.5 1" mode="trackcom"/>
<camera name="egocentric" pos=".3 0 .11" xyaxes="0 -1 0 .4 0 1" fovy="60"/>
<light name="light" pos="0 0 4" mode="trackcom"/>
<geom name="eye_r" type="cylinder" size=".05" fromto=".1 -.07 .12 .31 -.07 .08" mass="0"/>
<site name="pupil_r" type="sphere" size=".033" pos=".3 -.07 .08" zaxis="1 0 0" material="eye"/>
<geom name="eye_l" type="cylinder" size=".05" fromto=".1 .07 .12 .31 .07 .08" mass="0"/>
<site name="pupil_l" type="sphere" size=".033" pos=".3 .07 .08" zaxis="1 0 0" material="eye"/>
<site name="workspace" type="sphere" size=".3 .3 .3" material="site" pos=".8 0 -.2" group="3"/>
<site name="rf_00" class="rangefinder" fromto=".41 -.02 .11 .34 0 .115"/>
<site name="rf_01" class="rangefinder" fromto=".41 -.01 .11 .34 0 .115"/>
<site name="rf_02" class="rangefinder" fromto=".41 0 .11 .34 0 .115"/>
<site name="rf_03" class="rangefinder" fromto=".41 .01 .11 .34 0 .115"/>
<site name="rf_04" class="rangefinder" fromto=".41 .02 .11 .34 0 .115"/>
<site name="rf_10" class="rangefinder" fromto=".41 -.02 .1 .36 0 .11"/>
<site name="rf_11" class="rangefinder" fromto=".41 -.02 .1 .36 0 .11"/>
<site name="rf_12" class="rangefinder" fromto=".41 0 .1 .36 0 .11"/>
<site name="rf_13" class="rangefinder" fromto=".41 .01 .1 .36 0 .11"/>
<site name="rf_14" class="rangefinder" fromto=".41 .02 .1 .36 0 .11"/>
<site name="rf_20" class="rangefinder" fromto=".41 -.02 .09 .38 0 .105"/>
<site name="rf_21" class="rangefinder" fromto=".41 -.01 .09 .38 0 .105"/>
<site name="rf_22" class="rangefinder" fromto=".41 0 .09 .38 0 .105"/>
<site name="rf_23" class="rangefinder" fromto=".41 .01 .09 .38 0 .105"/>
<site name="rf_24" class="rangefinder" fromto=".41 .02 .09 .38 0 .105"/>
<site name="rf_30" class="rangefinder" fromto=".41 -.02 .08 .4 0 .1"/>
<site name="rf_31" class="rangefinder" fromto=".41 -.01 .08 .4 0 .1"/>
<site name="rf_32" class="rangefinder" fromto=".41 0 .08 .4 0 .1"/>
<site name="rf_33" class="rangefinder" fromto=".41 .01 .08 .4 0 .1"/>
<site name="rf_34" class="rangefinder" fromto=".41 .02 .08 .4 0 .1"/>
<geom name="torso" type="ellipsoid" size=".3 .27 .2" density="1000"/>
<site name="torso_touch" type="box" size=".26 .26 .26" rgba="0 0 1 0"/>
<site name="torso" size=".05" rgba="1 0 0 1" />
<body name="hip_front_left" pos=".2 .2 0" euler="0 0 45" childclass="hip">
<joint name="yaw_front_left" class="yaw"/>
<joint name="pitch_front_left" class="pitch"/>
<geom name="thigh_front_left"/>
<body name="knee_front_left" pos=".3 0 .11" childclass="knee">
<joint name="knee_front_left"/>
<geom name="shin_front_left"/>
<body name="ankle_front_left" pos=".25 0 -.25" childclass="ankle">
<joint name="ankle_front_left"/>
<geom name="foot_front_left"/>
<body name="toe_front_left" pos="0 0 -.3" childclass="toe">
<geom name="toe_front_left"/>
<site name="toe_front_left"/>
</body>
</body>
</body>
</body>
<body name="hip_front_right" pos=".2 -.2 0" euler="0 0 -45" childclass="hip">
<joint name="yaw_front_right" class="yaw"/>
<joint name="pitch_front_right" class="pitch"/>
<geom name="thigh_front_right"/>
<body name="knee_front_right" pos=".3 0 .11" childclass="knee">
<joint name="knee_front_right"/>
<geom name="shin_front_right"/>
<body name="ankle_front_right" pos=".25 0 -.25" childclass="ankle">
<joint name="ankle_front_right"/>
<geom name="foot_front_right"/>
<body name="toe_front_right" pos="0 0 -.3" childclass="toe">
<geom name="toe_front_right"/>
<site name="toe_front_right"/>
</body>
</body>
</body>
</body>
<body name="hip_back_right" pos="-.2 -.2 0" euler="0 0 -135" childclass="hip">
<joint name="yaw_back_right" class="yaw"/>
<joint name="pitch_back_right" class="pitch"/>
<geom name="thigh_back_right"/>
<body name="knee_back_right" pos=".3 0 .11" childclass="knee">
<joint name="knee_back_right"/>
<geom name="shin_back_right"/>
<body name="ankle_back_right" pos=".25 0 -.25" childclass="ankle">
<joint name="ankle_back_right"/>
<geom name="foot_back_right"/>
<body name="toe_back_right" pos="0 0 -.3" childclass="toe">
<geom name="toe_back_right"/>
<site name="toe_back_right"/>
</body>
</body>
</body>
</body>
<body name="hip_back_left" pos="-.2 .2 0" euler="0 0 135" childclass="hip">
<joint name="yaw_back_left" class="yaw"/>
<joint name="pitch_back_left" class="pitch"/>
<geom name="thigh_back_left"/>
<body name="knee_back_left" pos=".3 0 .11" childclass="knee">
<joint name="knee_back_left"/>
<geom name="shin_back_left"/>
<body name="ankle_back_left" pos=".25 0 -.25" childclass="ankle">
<joint name="ankle_back_left"/>
<geom name="foot_back_left"/>
<body name="toe_back_left" pos="0 0 -.3" childclass="toe">
<geom name="toe_back_left"/>
<site name="toe_back_left"/>
</body>
</body>
</body>
</body>
</body>
<body name="ball" pos="0 0 3">
<freejoint name="ball_root"/>
<geom name="ball" size=".15" material="ball" priority="1" condim="6" friction=".7 .005 .005"
solref="-10000 -30"/>
<light name="ball_light" pos="0 0 4" mode="trackcom"/>
</body>
</worldbody>
<tendon>
<fixed name="coupling_front_left">
<joint joint="pitch_front_left" coef=".333"/>
<joint joint="knee_front_left" coef=".333"/>
<joint joint="ankle_front_left" coef=".333"/>
</fixed>
<fixed name="coupling_front_right">
<joint joint="pitch_front_right" coef=".333"/>
<joint joint="knee_front_right" coef=".333"/>
<joint joint="ankle_front_right" coef=".333"/>
</fixed>
<fixed name="coupling_back_right">
<joint joint="pitch_back_right" coef=".333"/>
<joint joint="knee_back_right" coef=".333"/>
<joint joint="ankle_back_right" coef=".333"/>
</fixed>
<fixed name="coupling_back_left">
<joint joint="pitch_back_left" coef=".333"/>
<joint joint="knee_back_left" coef=".333"/>
<joint joint="ankle_back_left" coef=".333"/>
</fixed>
<fixed name="extend_front_left">
<joint joint="pitch_front_left" coef=".25"/>
<joint joint="knee_front_left" coef="-.5"/>
<joint joint="ankle_front_left" coef=".25"/>
</fixed>
<fixed name="lift_front_left">
<joint joint="pitch_front_left" coef=".5"/>
<joint joint="ankle_front_left" coef="-.5"/>
</fixed>
<fixed name="extend_front_right">
<joint joint="pitch_front_right" coef=".25"/>
<joint joint="knee_front_right" coef="-.5"/>
<joint joint="ankle_front_right" coef=".25"/>
</fixed>
<fixed name="lift_front_right">
<joint joint="pitch_front_right" coef=".5"/>
<joint joint="ankle_front_right" coef="-.5"/>
</fixed>
<fixed name="extend_back_right">
<joint joint="pitch_back_right" coef=".25"/>
<joint joint="knee_back_right" coef="-.5"/>
<joint joint="ankle_back_right" coef=".25"/>
</fixed>
<fixed name="lift_back_right">
<joint joint="pitch_back_right" coef=".5"/>
<joint joint="ankle_back_right" coef="-.5"/>
</fixed>
<fixed name="extend_back_left">
<joint joint="pitch_back_left" coef=".25"/>
<joint joint="knee_back_left" coef="-.5"/>
<joint joint="ankle_back_left" coef=".25"/>
</fixed>
<fixed name="lift_back_left">
<joint joint="pitch_back_left" coef=".5"/>
<joint joint="ankle_back_left" coef="-.5"/>
</fixed>
</tendon>
<equality>
<tendon name="coupling_front_left" tendon1="coupling_front_left" class="coupling"/>
<tendon name="coupling_front_right" tendon1="coupling_front_right" class="coupling"/>
<tendon name="coupling_back_right" tendon1="coupling_back_right" class="coupling"/>
<tendon name="coupling_back_left" tendon1="coupling_back_left" class="coupling"/>
</equality>
<actuator>
<general name="yaw_front_left" class="yaw_act" joint="yaw_front_left"/>
<general name="lift_front_left" class="lift_act" tendon="lift_front_left"/>
<general name="extend_front_left" class="extend_act" tendon="extend_front_left"/>
<general name="yaw_front_right" class="yaw_act" joint="yaw_front_right"/>
<general name="lift_front_right" class="lift_act" tendon="lift_front_right"/>
<general name="extend_front_right" class="extend_act" tendon="extend_front_right"/>
<general name="yaw_back_right" class="yaw_act" joint="yaw_back_right"/>
<general name="lift_back_right" class="lift_act" tendon="lift_back_right"/>
<general name="extend_back_right" class="extend_act" tendon="extend_back_right"/>
<general name="yaw_back_left" class="yaw_act" joint="yaw_back_left"/>
<general name="lift_back_left" class="lift_act" tendon="lift_back_left"/>
<general name="extend_back_left" class="extend_act" tendon="extend_back_left"/>
</actuator>
<sensor>
<accelerometer name="imu_accel" site="torso"/>
<gyro name="imu_gyro" site="torso"/>
<velocimeter name="velocimeter" site="torso"/>
<force name="force_toe_front_left" site="toe_front_left"/>
<force name="force_toe_front_right" site="toe_front_right"/>
<force name="force_toe_back_right" site="toe_back_right"/>
<force name="force_toe_back_left" site="toe_back_left"/>
<torque name="torque_toe_front_left" site="toe_front_left"/>
<torque name="torque_toe_front_right" site="toe_front_right"/>
<torque name="torque_toe_back_right" site="toe_back_right"/>
<torque name="torque_toe_back_left" site="toe_back_left"/>
<subtreecom name="center_of_mass" body="torso"/>
<rangefinder name="rf_00" site="rf_00"/>
<rangefinder name="rf_01" site="rf_01"/>
<rangefinder name="rf_02" site="rf_02"/>
<rangefinder name="rf_03" site="rf_03"/>
<rangefinder name="rf_04" site="rf_04"/>
<rangefinder name="rf_10" site="rf_10"/>
<rangefinder name="rf_11" site="rf_11"/>
<rangefinder name="rf_12" site="rf_12"/>
<rangefinder name="rf_13" site="rf_13"/>
<rangefinder name="rf_14" site="rf_14"/>
<rangefinder name="rf_20" site="rf_20"/>
<rangefinder name="rf_21" site="rf_21"/>
<rangefinder name="rf_22" site="rf_22"/>
<rangefinder name="rf_23" site="rf_23"/>
<rangefinder name="rf_24" site="rf_24"/>
<rangefinder name="rf_30" site="rf_30"/>
<rangefinder name="rf_31" site="rf_31"/>
<rangefinder name="rf_32" site="rf_32"/>
<rangefinder name="rf_33" site="rf_33"/>
<rangefinder name="rf_34" site="rf_34"/>
</sensor>
</mujoco>

View File

@ -0,0 +1,116 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Reacher domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
from dm_control.utils import rewards
import numpy as np
SUITE = containers.TaggedTasks()
_DEFAULT_TIME_LIMIT = 20
_BIG_TARGET = .05
_SMALL_TARGET = .015
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('reacher.xml'), common.ASSETS
@SUITE.add('benchmarking', 'easy')
def easy(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns reacher with sparse reward with 5e-2 tol and randomized target."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Reacher(target_size=_BIG_TARGET, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
@SUITE.add('benchmarking')
def hard(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns reacher with sparse reward with 1e-2 tol and randomized target."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = Reacher(target_size=_SMALL_TARGET, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, **environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Reacher domain."""
def finger_to_target(self):
"""Returns the vector from target to finger in global coordinates."""
return (self.named.data.geom_xpos['target', :2] -
self.named.data.geom_xpos['finger', :2])
def finger_to_target_dist(self):
"""Returns the signed distance between the finger and target surface."""
return np.linalg.norm(self.finger_to_target())
class Reacher(base.Task):
"""A reacher `Task` to reach the target."""
def __init__(self, target_size, random=None):
"""Initialize an instance of `Reacher`.
Args:
target_size: A `float`, tolerance to determine whether finger reached the
target.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._target_size = target_size
super(Reacher, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode."""
physics.named.model.geom_size['target', 0] = self._target_size
randomizers.randomize_limited_and_rotational_joints(physics, self.random)
# Randomize target position
angle = self.random.uniform(0, 2 * np.pi)
radius = self.random.uniform(.05, .20)
physics.named.model.geom_pos['target', 'x'] = radius * np.sin(angle)
physics.named.model.geom_pos['target', 'y'] = radius * np.cos(angle)
super(Reacher, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of the state and the target position."""
obs = collections.OrderedDict()
obs['position'] = physics.position()
obs['to_target'] = physics.finger_to_target()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
radii = physics.named.model.geom_size[['target', 'finger'], 0].sum()
return rewards.tolerance(physics.finger_to_target_dist(), (0, radii))
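
A minimal usage sketch for the two variants above (assuming the module is importable as local_dm_control_suite.reacher, consistent with the other imports in this commit). Because rewards.tolerance is called with its default zero margin, get_reward returns 1 whenever the finger-to-target distance is within the summed geom radii and 0 otherwise:

import numpy as np
from local_dm_control_suite import reacher  # assumed import path

env = reacher.easy()  # sparse reward, large (5e-2) target
spec = env.action_spec()
timestep = env.reset()
while not timestep.last():
    # Uniform random torques within the declared control range.
    action = np.random.uniform(spec.minimum, spec.maximum, size=spec.shape)
    timestep = env.step(action)
    if timestep.reward:
        print('finger inside target, offset:', timestep.observation['to_target'])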

View File

@ -0,0 +1,47 @@
<mujoco model="two-link planar reacher">
<include file="./common/skybox.xml"/>
<include file="./common/visual.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.02">
<flag contact="disable"/>
</option>
<default>
<joint type="hinge" axis="0 0 1" damping="0.01"/>
<motor gear=".05" ctrlrange="-1 1" ctrllimited="true"/>
</default>
<worldbody>
<light name="light" pos="0 0 1"/>
<camera name="fixed" pos="0 0 .75" quat="1 0 0 0"/>
<!-- Arena -->
<geom name="ground" type="plane" pos="0 0 0" size=".3 .3 10" material="grid"/>
<geom name="wall_x" type="plane" pos="-.3 0 .02" zaxis="1 0 0" size=".02 .3 .02" material="decoration"/>
<geom name="wall_y" type="plane" pos="0 -.3 .02" zaxis="0 1 0" size=".3 .02 .02" material="decoration"/>
<geom name="wall_neg_x" type="plane" pos=".3 0 .02" zaxis="-1 0 0" size=".02 .3 .02" material="decoration"/>
<geom name="wall_neg_y" type="plane" pos="0 .3 .02" zaxis="0 -1 0" size=".3 .02 .02" material="decoration"/>
<!-- Arm -->
<geom name="root" type="cylinder" fromto="0 0 0 0 0 0.02" size=".011" material="decoration"/>
<body name="arm" pos="0 0 .01">
<geom name="arm" type="capsule" fromto="0 0 0 0.12 0 0" size=".01" material="self"/>
<joint name="shoulder"/>
<body name="hand" pos=".12 0 0">
<geom name="hand" type="capsule" fromto="0 0 0 0.1 0 0" size=".01" material="self"/>
<joint name="wrist" limited="true" range="-160 160"/>
<body name="finger" pos=".12 0 0">
<camera name="hand" pos="0 0 .2" mode="track"/>
<geom name="finger" type="sphere" size=".01" material="effector"/>
</body>
</body>
</body>
<!-- Target -->
<geom name="target" pos="0 0 .01" material="target" type="sphere" size=".05"/>
</worldbody>
<actuator>
<motor name="shoulder" joint="shoulder"/>
<motor name="wrist" joint="wrist"/>
</actuator>
</mujoco>

View File

@ -0,0 +1,208 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Planar Stacker domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.utils import containers
from dm_control.utils import rewards
from dm_control.utils import xml_tools
from lxml import etree
import numpy as np
_CLOSE = .01 # (Meters) Distance below which a thing is considered close.
_CONTROL_TIMESTEP = .01 # (Seconds)
_TIME_LIMIT = 10 # (Seconds)
_ARM_JOINTS = ['arm_root', 'arm_shoulder', 'arm_elbow', 'arm_wrist',
'finger', 'fingertip', 'thumb', 'thumbtip']
SUITE = containers.TaggedTasks()
def make_model(n_boxes):
"""Returns a tuple containing the model XML string and a dict of assets."""
xml_string = common.read_model('stacker.xml')
parser = etree.XMLParser(remove_blank_text=True)
mjcf = etree.XML(xml_string, parser)
# Remove unused boxes
for b in range(n_boxes, 4):
box = xml_tools.find_element(mjcf, 'body', 'box' + str(b))
box.getparent().remove(box)
return etree.tostring(mjcf, pretty_print=True), common.ASSETS
@SUITE.add('hard')
def stack_2(fully_observable=True, time_limit=_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns stacker task with 2 boxes."""
n_boxes = 2
physics = Physics.from_xml_string(*make_model(n_boxes=n_boxes))
task = Stack(n_boxes=n_boxes,
fully_observable=fully_observable,
random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
@SUITE.add('hard')
def stack_4(fully_observable=True, time_limit=_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns stacker task with 4 boxes."""
n_boxes = 4
physics = Physics.from_xml_string(*make_model(n_boxes=n_boxes))
task = Stack(n_boxes=n_boxes,
fully_observable=fully_observable,
random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, control_timestep=_CONTROL_TIMESTEP, time_limit=time_limit,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics with additional features for the Planar Manipulator domain."""
def bounded_joint_pos(self, joint_names):
"""Returns joint positions as (sin, cos) values."""
joint_pos = self.named.data.qpos[joint_names]
return np.vstack([np.sin(joint_pos), np.cos(joint_pos)]).T
def joint_vel(self, joint_names):
"""Returns joint velocities."""
return self.named.data.qvel[joint_names]
def body_2d_pose(self, body_names, orientation=True):
"""Returns positions and/or orientations of bodies."""
if not isinstance(body_names, str):
body_names = np.array(body_names).reshape(-1, 1) # Broadcast indices.
pos = self.named.data.xpos[body_names, ['x', 'z']]
if orientation:
ori = self.named.data.xquat[body_names, ['qw', 'qy']]
return np.hstack([pos, ori])
else:
return pos
def touch(self):
return np.log1p(self.data.sensordata)
def site_distance(self, site1, site2):
site1_to_site2 = np.diff(self.named.data.site_xpos[[site2, site1]], axis=0)
return np.linalg.norm(site1_to_site2)
class Stack(base.Task):
"""A Stack `Task`: stack the boxes."""
def __init__(self, n_boxes, fully_observable, random=None):
"""Initialize an instance of the `Stack` task.
Args:
n_boxes: An `int`, number of boxes to stack.
fully_observable: A `bool`, whether the observation should contain the
positions and velocities of the boxes and the location of the target.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._n_boxes = n_boxes
self._box_names = ['box' + str(b) for b in range(n_boxes)]
self._box_joint_names = []
for name in self._box_names:
for dim in 'xyz':
self._box_joint_names.append('_'.join([name, dim]))
self._fully_observable = fully_observable
super(Stack, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode."""
# Local aliases
randint = self.random.randint
uniform = self.random.uniform
model = physics.named.model
data = physics.named.data
# Find a collision-free random initial configuration.
penetrating = True
while penetrating:
# Randomise angles of arm joints.
      is_limited = model.jnt_limited[_ARM_JOINTS].astype(bool)
joint_range = model.jnt_range[_ARM_JOINTS]
lower_limits = np.where(is_limited, joint_range[:, 0], -np.pi)
upper_limits = np.where(is_limited, joint_range[:, 1], np.pi)
angles = uniform(lower_limits, upper_limits)
data.qpos[_ARM_JOINTS] = angles
# Symmetrize hand.
data.qpos['finger'] = data.qpos['thumb']
# Randomise target location.
target_height = 2*randint(self._n_boxes) + 1
box_size = model.geom_size['target', 0]
model.body_pos['target', 'z'] = box_size * target_height
model.body_pos['target', 'x'] = uniform(-.37, .37)
# Randomise box locations.
for name in self._box_names:
data.qpos[name + '_x'] = uniform(.1, .3)
data.qpos[name + '_z'] = uniform(0, .7)
data.qpos[name + '_y'] = uniform(0, 2*np.pi)
# Check for collisions.
physics.after_reset()
penetrating = physics.data.ncon > 0
super(Stack, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns either features or only sensors (to be used with pixels)."""
obs = collections.OrderedDict()
obs['arm_pos'] = physics.bounded_joint_pos(_ARM_JOINTS)
obs['arm_vel'] = physics.joint_vel(_ARM_JOINTS)
obs['touch'] = physics.touch()
if self._fully_observable:
obs['hand_pos'] = physics.body_2d_pose('hand')
obs['box_pos'] = physics.body_2d_pose(self._box_names)
obs['box_vel'] = physics.joint_vel(self._box_joint_names)
obs['target_pos'] = physics.body_2d_pose('target', orientation=False)
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
box_size = physics.named.model.geom_size['target', 0]
min_box_to_target_distance = min(physics.site_distance(name, 'target')
for name in self._box_names)
box_is_close = rewards.tolerance(min_box_to_target_distance,
margin=2*box_size)
hand_to_target_distance = physics.site_distance('grasp', 'target')
hand_is_far = rewards.tolerance(hand_to_target_distance,
bounds=(.1, float('inf')),
margin=_CLOSE)
return box_is_close * hand_is_far
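
A sketch of how the task composes (import path assumed, as above). The reward is the product of two rewards.tolerance terms: one grows as the closest box approaches the target (with a margin of two box half-widths) and the other is 1 only while the grasp site stays at least 10 cm from the target, which discourages the arm from simply holding a box in place:

from local_dm_control_suite import stacker  # assumed import path

env = stacker.stack_2(fully_observable=True)
timestep = env.reset()

# Fully observable: proprioception plus box poses/velocities and target pose.
print(sorted(timestep.observation))
# ['arm_pos', 'arm_vel', 'box_pos', 'box_vel', 'hand_pos', 'target_pos', 'touch']

# With fully_observable=False only 'arm_pos', 'arm_vel' and 'touch' remain,
# the setting intended for learning from pixels.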

View File

@ -0,0 +1,193 @@
<mujoco model="planar stacker">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials_white_floor.xml"/>
<asset>
<texture name="background" builtin="flat" type="2d" mark="random" markrgb="1 1 1" width="800" height="800" rgb1=".2 .3 .4"/>
<material name="background" texture="background" texrepeat="1 1" texuniform="true"/>
</asset>
<visual>
<map shadowclip=".5"/>
<quality shadowsize="2048"/>
  </visual>
<option timestep="0.001" cone="elliptic"/>
<default>
<geom friction=".7" solimp="0.9 0.97 0.001" solref=".01 1"/>
<joint solimplimit="0 0.99 0.01" solreflimit=".005 1"/>
<general ctrllimited="true"/>
<tendon width="0.01"/>
<site size=".003 .003 .003" material="site" group="3"/>
<default class="arm">
<geom type="capsule" material="self" density="500"/>
<joint type="hinge" pos="0 0 0" axis="0 -1 0" limited="true"/>
<default class="hand">
<joint damping=".5" range="-10 60"/>
<geom size=".008"/>
<site type="box" size=".018 .005 .005" pos=".022 0 -.002" euler="0 15 0" group="4"/>
<default class="fingertip">
<geom type="sphere" size=".008" material="effector"/>
<joint damping=".01" stiffness=".01" range="-40 20"/>
<site size=".012 .005 .008" pos=".003 0 .003" group="4" euler="0 0 0"/>
</default>
</default>
</default>
<default class="object">
<geom material="self"/>
</default>
<default class="task">
<site rgba="0 0 0 0"/>
</default>
<default class="obstacle">
<geom material="decoration" friction="0"/>
</default>
<default class="ghost">
<geom material="target" contype="0" conaffinity="0"/>
</default>
</default>
<worldbody>
<!-- Arena -->
<light name="light" directional="true" diffuse=".6 .6 .6" pos="0 0 1" specular=".3 .3 .3"/>
<geom name="floor" type="plane" pos="0 0 0" size=".4 .2 10" material="grid"/>
<geom name="wall1" type="plane" pos="-.682843 0 .282843" size=".4 .2 10" material="grid" zaxis="1 0 1"/>
<geom name="wall2" type="plane" pos=".682843 0 .282843" size=".4 .2 10" material="grid" zaxis="-1 0 1"/>
<geom name="background" type="plane" pos="0 .2 .5" size="1 .5 10" material="background" zaxis="0 -1 0"/>
<camera name="fixed" pos="0 -16 .4" xyaxes="1 0 0 0 0 1" fovy="4"/>
<!-- Arm -->
<geom name="arm_root" type="cylinder" fromto="0 -.022 .4 0 .022 .4" size=".024"
material="decoration" contype="0" conaffinity="0"/>
<body name="upper_arm" pos="0 0 .4" childclass="arm">
<joint name="arm_root" damping="2" limited="false"/>
<geom name="upper_arm" size=".02" fromto="0 0 0 0 0 .18"/>
<body name="middle_arm" pos="0 0 .18" childclass="arm">
<joint name="arm_shoulder" damping="1.5" range="-160 160"/>
<geom name="middle_arm" size=".017" fromto="0 0 0 0 0 .15"/>
<body name="lower_arm" pos="0 0 .15">
<joint name="arm_elbow" damping="1" range="-160 160"/>
<geom name="lower_arm" size=".014" fromto="0 0 0 0 0 .12"/>
<body name="hand" pos="0 0 .12">
<joint name="arm_wrist" damping=".5" range="-140 140" />
<geom name="hand" size=".011" fromto="0 0 0 0 0 .03"/>
<geom name="palm1" fromto="0 0 .03 .03 0 .045" class="hand"/>
<geom name="palm2" fromto="0 0 .03 -.03 0 .045" class="hand"/>
<site name="grasp" pos="0 0 .065"/>
<body name="pinch site" pos="0 0 .090">
<site name="pinch"/>
<inertial pos="0 0 0" mass="1e-6" diaginertia="1e-12 1e-12 1e-12"/>
<camera name="hand" pos="0 -.3 0" xyaxes="1 0 0 0 0 1" mode="track"/>
</body>
<site name="palm_touch" type="box" group="4" size=".025 .005 .008" pos="0 0 .043"/>
<body name="thumb" pos=".03 0 .045" euler="0 -90 0" childclass="hand">
<joint name="thumb"/>
<geom name="thumb1" fromto="0 0 0 .02 0 -.01" size=".007"/>
<geom name="thumb2" fromto=".02 0 -.01 .04 0 -.01" size=".007"/>
<site name="thumb_touch" group="4"/>
<body name="thumbtip" pos=".05 0 -.01" childclass="fingertip">
<joint name="thumbtip"/>
<geom name="thumbtip1" pos="-.003 0 0" />
<geom name="thumbtip2" pos=".003 0 0" />
<site name="thumbtip_touch" group="4"/>
</body>
</body>
<body name="finger" pos="-.03 0 .045" euler="0 90 180" childclass="hand">
<joint name="finger"/>
<geom name="finger1" fromto="0 0 0 .02 0 -.01" size=".007" />
<geom name="finger2" fromto=".02 0 -.01 .04 0 -.01" size=".007"/>
<site name="finger_touch"/>
<body name="fingertip" pos=".05 0 -.01" childclass="fingertip">
<joint name="fingertip"/>
<geom name="fingertip1" pos="-.003 0 0" />
<geom name="fingertip2" pos=".003 0 0" />
<site name="fingertip_touch"/>
</body>
</body>
</body>
</body>
</body>
</body>
<!-- props -->
<body name="box0" pos=".5 0 .4" childclass="object">
<joint name="box0_x" type="slide" axis="1 0 0" ref=".5"/>
<joint name="box0_z" type="slide" axis="0 0 1" ref=".4"/>
<joint name="box0_y" type="hinge" axis="0 1 0"/>
<geom name="box0" type="box" size=".022 .022 .022" />
<site name="box0" type="sphere"/>
</body>
<body name="box1" pos=".4 0 .4" childclass="object">
<joint name="box1_x" type="slide" axis="1 0 0" ref=".4"/>
<joint name="box1_z" type="slide" axis="0 0 1" ref=".4"/>
<joint name="box1_y" type="hinge" axis="0 1 0"/>
<geom name="box1" type="box" size=".022 .022 .022" />
<site name="box1" type="sphere"/>
</body>
<body name="box2" pos=".3 0 .4" childclass="object">
<joint name="box2_x" type="slide" axis="1 0 0" ref=".3"/>
<joint name="box2_z" type="slide" axis="0 0 1" ref=".4"/>
<joint name="box2_y" type="hinge" axis="0 1 0"/>
<geom name="box2" type="box" size=".022 .022 .022" />
<site name="box2" type="sphere"/>
</body>
<body name="box3" pos=".2 0 .4" childclass="object">
<joint name="box3_x" type="slide" axis="1 0 0" ref=".2"/>
<joint name="box3_z" type="slide" axis="0 0 1" ref=".4"/>
<joint name="box3_y" type="hinge" axis="0 1 0"/>
<geom name="box3" type="box" size=".022 .022 .022" />
<site name="box3" type="sphere"/>
</body>
<!-- targets -->
<body name="target" pos="0 .001 .022" childclass="ghost">
<geom name="target" type="box" size=".022 .022 .022" />
<site name="target" type="sphere"/>
</body>
</worldbody>
<tendon>
<fixed name="grasp">
<joint joint="thumb" coef=".5"/>
<joint joint="finger" coef=".5"/>
</fixed>
<fixed name="coupling">
<joint joint="thumb" coef="-.5"/>
<joint joint="finger" coef=".5"/>
</fixed>
</tendon>
<equality>
<tendon name="coupling" tendon1="coupling" solimp="0.95 0.99 0.001" solref=".005 .5"/>
</equality>
<sensor>
<touch name="palm_touch" site="palm_touch"/>
<touch name="finger_touch" site="finger_touch"/>
<touch name="thumb_touch" site="thumb_touch"/>
<touch name="fingertip_touch" site="fingertip_touch"/>
<touch name="thumbtip_touch" site="thumbtip_touch"/>
</sensor>
<actuator>
<motor name="root" joint="arm_root" ctrlrange="-1 1" gear="12"/>
<motor name="shoulder" joint="arm_shoulder" ctrlrange="-1 1" gear="8"/>
<motor name="elbow" joint="arm_elbow" ctrlrange="-1 1" gear="4"/>
<motor name="wrist" joint="arm_wrist" ctrlrange="-1 1" gear="2"/>
<motor name="grasp" tendon="grasp" ctrlrange="-1 1" gear="2"/>
</actuator>
</mujoco>

View File

@ -0,0 +1,215 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Procedurally generated Swimmer domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
from dm_control.utils import rewards
from lxml import etree
import numpy as np
from six.moves import range
_DEFAULT_TIME_LIMIT = 30
_CONTROL_TIMESTEP = .03 # (Seconds)
SUITE = containers.TaggedTasks()
def get_model_and_assets(n_joints):
"""Returns a tuple containing the model XML string and a dict of assets.
Args:
n_joints: An integer specifying the number of joints in the swimmer.
Returns:
A tuple `(model_xml_string, assets)`, where `assets` is a dict consisting of
`{filename: contents_string}` pairs.
"""
return _make_model(n_joints), common.ASSETS
@SUITE.add('benchmarking')
def swimmer6(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns a 6-link swimmer."""
return _make_swimmer(6, time_limit, random=random,
environment_kwargs=environment_kwargs)
@SUITE.add('benchmarking')
def swimmer15(time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns a 15-link swimmer."""
return _make_swimmer(15, time_limit, random=random,
environment_kwargs=environment_kwargs)
def swimmer(n_links=3, time_limit=_DEFAULT_TIME_LIMIT,
random=None, environment_kwargs=None):
"""Returns a swimmer with n links."""
return _make_swimmer(n_links, time_limit, random=random,
environment_kwargs=environment_kwargs)
def _make_swimmer(n_joints, time_limit=_DEFAULT_TIME_LIMIT, random=None,
environment_kwargs=None):
"""Returns a swimmer control environment."""
model_string, assets = get_model_and_assets(n_joints)
physics = Physics.from_xml_string(model_string, assets=assets)
task = Swimmer(random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
def _make_model(n_bodies):
"""Generates an xml string defining a swimmer with `n_bodies` bodies."""
if n_bodies < 3:
raise ValueError('At least 3 bodies required. Received {}'.format(n_bodies))
mjcf = etree.fromstring(common.read_model('swimmer.xml'))
head_body = mjcf.find('./worldbody/body')
actuator = etree.SubElement(mjcf, 'actuator')
sensor = etree.SubElement(mjcf, 'sensor')
parent = head_body
for body_index in range(n_bodies - 1):
site_name = 'site_{}'.format(body_index)
child = _make_body(body_index=body_index)
child.append(etree.Element('site', name=site_name))
joint_name = 'joint_{}'.format(body_index)
joint_limit = 360.0/n_bodies
joint_range = '{} {}'.format(-joint_limit, joint_limit)
child.append(etree.Element('joint', {'name': joint_name,
'range': joint_range}))
motor_name = 'motor_{}'.format(body_index)
actuator.append(etree.Element('motor', name=motor_name, joint=joint_name))
velocimeter_name = 'velocimeter_{}'.format(body_index)
sensor.append(etree.Element('velocimeter', name=velocimeter_name,
site=site_name))
gyro_name = 'gyro_{}'.format(body_index)
sensor.append(etree.Element('gyro', name=gyro_name, site=site_name))
parent.append(child)
parent = child
# Move tracking cameras further away from the swimmer according to its length.
cameras = mjcf.findall('./worldbody/body/camera')
scale = n_bodies / 6.0
for cam in cameras:
if cam.get('mode') == 'trackcom':
old_pos = cam.get('pos').split(' ')
new_pos = ' '.join([str(float(dim) * scale) for dim in old_pos])
cam.set('pos', new_pos)
return etree.tostring(mjcf, pretty_print=True)
def _make_body(body_index):
"""Generates an xml string defining a single physical body."""
body_name = 'segment_{}'.format(body_index)
visual_name = 'visual_{}'.format(body_index)
inertial_name = 'inertial_{}'.format(body_index)
body = etree.Element('body', name=body_name)
body.set('pos', '0 .1 0')
etree.SubElement(body, 'geom', {'class': 'visual', 'name': visual_name})
etree.SubElement(body, 'geom', {'class': 'inertial', 'name': inertial_name})
return body
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the swimmer domain."""
def nose_to_target(self):
"""Returns a vector from nose to target in local coordinate of the head."""
nose_to_target = (self.named.data.geom_xpos['target'] -
self.named.data.geom_xpos['nose'])
head_orientation = self.named.data.xmat['head'].reshape(3, 3)
return nose_to_target.dot(head_orientation)[:2]
def nose_to_target_dist(self):
"""Returns the distance from the nose to the target."""
return np.linalg.norm(self.nose_to_target())
def body_velocities(self):
"""Returns local body velocities: x,y linear, z rotational."""
xvel_local = self.data.sensordata[12:].reshape((-1, 6))
vx_vy_wz = [0, 1, 5] # Indices for linear x,y vels and rotational z vel.
return xvel_local[:, vx_vy_wz].ravel()
def joints(self):
"""Returns all internal joint angles (excluding root joints)."""
return self.data.qpos[3:].copy()
class Swimmer(base.Task):
"""A swimmer `Task` to reach the target or just swim."""
def __init__(self, random=None):
"""Initializes an instance of `Swimmer`.
Args:
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
super(Swimmer, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
Initializes the swimmer orientation to [-pi, pi) and the relative joint
angle of each joint uniformly within its range.
Args:
physics: An instance of `Physics`.
"""
# Random joint angles:
randomizers.randomize_limited_and_rotational_joints(physics, self.random)
# Random target position.
close_target = self.random.rand() < .2 # Probability of a close target.
target_box = .3 if close_target else 2
xpos, ypos = self.random.uniform(-target_box, target_box, size=2)
physics.named.model.geom_pos['target', 'x'] = xpos
physics.named.model.geom_pos['target', 'y'] = ypos
physics.named.model.light_pos['target_light', 'x'] = xpos
physics.named.model.light_pos['target_light', 'y'] = ypos
super(Swimmer, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of joint angles, body velocities and target."""
obs = collections.OrderedDict()
obs['joints'] = physics.joints()
obs['to_target'] = physics.nose_to_target()
obs['body_velocities'] = physics.body_velocities()
return obs
def get_reward(self, physics):
"""Returns a smooth reward."""
target_size = physics.named.model.geom_size['target', 0]
return rewards.tolerance(physics.nose_to_target_dist(),
bounds=(0, target_size),
margin=5*target_size,
sigmoid='long_tail')
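
A small sketch of the procedural construction (import path assumed). Besides the two registered benchmarking variants, swimmer() accepts an arbitrary link count; _make_model gives every added body one limited hinge joint (range +/- 360/n degrees), one motor, one velocimeter and one gyro:

from local_dm_control_suite import swimmer  # assumed import path

env6 = swimmer.swimmer6()    # registered benchmarking task
env15 = swimmer.swimmer15()  # registered benchmarking task

# Arbitrary link counts go through the generic constructor.
env8 = swimmer.swimmer(n_links=8)
print(env8.action_spec().shape)  # (7,): one actuator per generated joint

# The generated MJCF can also be inspected directly (module-private helper).
xml_string = swimmer._make_model(n_bodies=8)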

View File

@ -0,0 +1,57 @@
<mujoco model="swimmer">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.002" density="3000">
<flag contact="disable"/>
</option>
<default>
<default class="swimmer">
<joint type="hinge" pos="0 -.05 0" axis="0 0 1" limited="true" solreflimit=".05 1" solimplimit="0 .8 .1" armature="1e-6"/>
<default class="inertial">
<geom type="box" size=".001 .05 .01" rgba="0 0 0 0" mass=".01"/>
</default>
<default class="visual">
<geom type="capsule" size=".01" fromto="0 -.05 0 0 .05 0" material="self" mass="0"/>
</default>
<site size=".01" rgba="0 0 0 0"/>
</default>
<default class="free">
<joint limited="false" stiffness="0" armature="0"/>
</default>
<motor gear="5e-4" ctrllimited="true" ctrlrange="-1 1"/>
</default>
<worldbody>
<geom name="ground" type="plane" size="2 2 0.1" material="grid"/>
<body name="head" pos="0 0 .05" childclass="swimmer">
<light name="light_1" diffuse=".8 .8 .8" pos="0 0 1.5"/>
<geom name="head" type="ellipsoid" size=".02 .04 .017" pos="0 -.022 0" material="self" mass="0"/>
<geom name="nose" type="sphere" pos="0 -.06 0" size=".004" material="effector" mass="0"/>
<geom name="eyes" type="capsule" fromto="-.006 -.054 .005 .006 -.054 .005" size=".004" material="eye" mass="0"/>
<camera name="tracking1" pos="0 -.2 .5" xyaxes="1 0 0 0 1 1" mode="trackcom" fovy="60"/>
<camera name="tracking2" pos="-.9 .5 .15" xyaxes="0 -1 0 .3 0 1" mode="trackcom" fovy="60"/>
<camera name="eyes" pos="0 -.058 .005" xyaxes="-1 0 0 0 0 1"/>
<joint name="rootx" class="free" type="slide" axis="1 0 0" pos="0 -.05 0"/>
<joint name="rooty" class="free" type="slide" axis="0 1 0" pos="0 -.05 0"/>
<joint name="rootz" class="free" type="hinge" axis="0 0 1" pos="0 -.05 0"/>
<geom name="inertial" class="inertial"/>
<geom name="visual" class="visual"/>
<site name="head"/>
</body>
<geom name="target" type="sphere" pos="1 1 .05" size=".1" material="target"/>
<light name="target_light" diffuse="1 1 1" pos="1 1 1.5"/>
</worldbody>
<sensor>
<framepos name="nose_pos" objtype="geom" objname="nose"/>
<framepos name="target_pos" objtype="geom" objname="target"/>
<framexaxis name="head_xaxis" objtype="xbody" objname="head"/>
<frameyaxis name="head_yaxis" objtype="xbody" objname="head"/>
<velocimeter name="head_vel" site="head"/>
<gyro name="head_gyro" site="head"/>
</sensor>
</mujoco>
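
Note how this model lines up with Physics.body_velocities() in swimmer.py above: the four frame sensors declared here occupy the first 12 entries of sensordata, so slicing from index 12 leaves the head velocimeter/gyro followed by the velocimeter/gyro pairs appended per segment by _make_model. A sketch of the layout that slice assumes (physics being a swimmer Physics instance):

# sensordata layout assumed by Physics.body_velocities():
#   [ 0: 3]  framepos   nose_pos
#   [ 3: 6]  framepos   target_pos
#   [ 6: 9]  framexaxis head_xaxis
#   [ 9:12]  frameyaxis head_yaxis
#   [12:  ]  velocimeter + gyro for the head, then for each generated segment
xvel_local = physics.data.sensordata[12:].reshape((-1, 6))  # one row per body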

View File

@ -0,0 +1,292 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tests for dm_control.suite domains."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Internal dependencies.
from absl.testing import absltest
from absl.testing import parameterized
from dm_control import suite
from dm_control.rl import control
import mock
import numpy as np
import six
from six.moves import range
from six.moves import zip
def uniform_random_policy(action_spec, random=None):
lower_bounds = action_spec.minimum
upper_bounds = action_spec.maximum
# Draw values between -1 and 1 for unbounded actions.
lower_bounds = np.where(np.isinf(lower_bounds), -1.0, lower_bounds)
upper_bounds = np.where(np.isinf(upper_bounds), 1.0, upper_bounds)
random_state = np.random.RandomState(random)
def policy(time_step):
del time_step # Unused.
return random_state.uniform(lower_bounds, upper_bounds)
return policy
def step_environment(env, policy, num_episodes=5, max_steps_per_episode=10):
for _ in range(num_episodes):
step_count = 0
time_step = env.reset()
yield time_step
while not time_step.last():
action = policy(time_step)
time_step = env.step(action)
step_count += 1
yield time_step
if step_count >= max_steps_per_episode:
break
def make_trajectory(domain, task, seed, **trajectory_kwargs):
env = suite.load(domain, task, task_kwargs={'random': seed})
policy = uniform_random_policy(env.action_spec(), random=seed)
return step_environment(env, policy, **trajectory_kwargs)
class DomainTest(parameterized.TestCase):
"""Tests run on all the tasks registered."""
def test_constants(self):
num_tasks = sum(len(tasks) for tasks in
six.itervalues(suite.TASKS_BY_DOMAIN))
self.assertLen(suite.ALL_TASKS, num_tasks)
def _validate_observation(self, observation_dict, observation_spec):
obs = observation_dict.copy()
for name, spec in six.iteritems(observation_spec):
arr = obs.pop(name)
self.assertEqual(arr.shape, spec.shape)
self.assertEqual(arr.dtype, spec.dtype)
self.assertTrue(
np.all(np.isfinite(arr)),
msg='{!r} has non-finite value(s): {!r}'.format(name, arr))
self.assertEmpty(
obs,
msg='Observation contains arrays(s) that are not in the spec: {!r}'
.format(obs))
def _validate_reward_range(self, time_step):
if time_step.first():
self.assertIsNone(time_step.reward)
else:
self.assertIsInstance(time_step.reward, float)
self.assertBetween(time_step.reward, 0, 1)
def _validate_discount(self, time_step):
if time_step.first():
self.assertIsNone(time_step.discount)
else:
self.assertIsInstance(time_step.discount, float)
self.assertBetween(time_step.discount, 0, 1)
def _validate_control_range(self, lower_bounds, upper_bounds):
for b in lower_bounds:
self.assertEqual(b, -1.0)
for b in upper_bounds:
self.assertEqual(b, 1.0)
@parameterized.parameters(*suite.ALL_TASKS)
def test_components_have_names(self, domain, task):
env = suite.load(domain, task)
model = env.physics.model
object_types_and_size_fields = [
('body', 'nbody'),
('joint', 'njnt'),
('geom', 'ngeom'),
('site', 'nsite'),
('camera', 'ncam'),
('light', 'nlight'),
('mesh', 'nmesh'),
('hfield', 'nhfield'),
('texture', 'ntex'),
('material', 'nmat'),
('equality', 'neq'),
('tendon', 'ntendon'),
('actuator', 'nu'),
('sensor', 'nsensor'),
('numeric', 'nnumeric'),
('text', 'ntext'),
('tuple', 'ntuple'),
]
for object_type, size_field in object_types_and_size_fields:
for idx in range(getattr(model, size_field)):
object_name = model.id2name(idx, object_type)
self.assertNotEqual(object_name, '',
msg='Model {!r} contains unnamed {!r} with ID {}.'
.format(model.name, object_type, idx))
@parameterized.parameters(*suite.ALL_TASKS)
def test_model_has_at_least_2_cameras(self, domain, task):
env = suite.load(domain, task)
model = env.physics.model
self.assertGreaterEqual(model.ncam, 2,
'Model {!r} should have at least 2 cameras, has {}.'
.format(model.name, model.ncam))
@parameterized.parameters(*suite.ALL_TASKS)
def test_task_conforms_to_spec(self, domain, task):
"""Tests that the environment timesteps conform to specifications."""
is_benchmark = (domain, task) in suite.BENCHMARKING
env = suite.load(domain, task)
observation_spec = env.observation_spec()
action_spec = env.action_spec()
# Check action bounds.
if is_benchmark:
self._validate_control_range(action_spec.minimum, action_spec.maximum)
# Step through the environment, applying random actions sampled within the
# valid range and check the observations, rewards, and discounts.
policy = uniform_random_policy(action_spec)
for time_step in step_environment(env, policy):
self._validate_observation(time_step.observation, observation_spec)
self._validate_discount(time_step)
if is_benchmark:
self._validate_reward_range(time_step)
@parameterized.parameters(*suite.ALL_TASKS)
def test_environment_is_deterministic(self, domain, task):
"""Tests that identical seeds and actions produce identical trajectories."""
seed = 0
# Iterate over two trajectories generated using identical sequences of
# random actions, and with identical task random states. Check that the
# observations, rewards, discounts and step types are identical.
trajectory1 = make_trajectory(domain=domain, task=task, seed=seed)
trajectory2 = make_trajectory(domain=domain, task=task, seed=seed)
for time_step1, time_step2 in zip(trajectory1, trajectory2):
self.assertEqual(time_step1.step_type, time_step2.step_type)
self.assertEqual(time_step1.reward, time_step2.reward)
self.assertEqual(time_step1.discount, time_step2.discount)
for key in six.iterkeys(time_step1.observation):
np.testing.assert_array_equal(
time_step1.observation[key], time_step2.observation[key],
err_msg='Observation {!r} is not equal.'.format(key))
def assertCorrectColors(self, physics, reward):
colors = physics.named.model.mat_rgba
for material_name in ('self', 'effector', 'target'):
highlight = colors[material_name + '_highlight']
default = colors[material_name + '_default']
blend_coef = reward ** 4
expected = blend_coef * highlight + (1.0 - blend_coef) * default
actual = colors[material_name]
err_msg = ('Material {!r} has unexpected color.\nExpected: {!r}\n'
'Actual: {!r}'.format(material_name, expected, actual))
np.testing.assert_array_almost_equal(expected, actual, err_msg=err_msg)
@parameterized.parameters(*suite.ALL_TASKS)
def test_visualize_reward(self, domain, task):
env = suite.load(domain, task)
env.task.visualize_reward = True
action = np.zeros(env.action_spec().shape)
with mock.patch.object(env.task, 'get_reward') as mock_get_reward:
mock_get_reward.return_value = -3.0 # Rewards < 0 should be clipped.
env.reset()
mock_get_reward.assert_called_with(env.physics)
self.assertCorrectColors(env.physics, reward=0.0)
mock_get_reward.reset_mock()
mock_get_reward.return_value = 0.5
env.step(action)
mock_get_reward.assert_called_with(env.physics)
self.assertCorrectColors(env.physics, reward=mock_get_reward.return_value)
mock_get_reward.reset_mock()
mock_get_reward.return_value = 2.0 # Rewards > 1 should be clipped.
env.step(action)
mock_get_reward.assert_called_with(env.physics)
self.assertCorrectColors(env.physics, reward=1.0)
mock_get_reward.reset_mock()
mock_get_reward.return_value = 0.25
env.reset()
mock_get_reward.assert_called_with(env.physics)
self.assertCorrectColors(env.physics, reward=mock_get_reward.return_value)
@parameterized.parameters(*suite.ALL_TASKS)
def test_task_supports_environment_kwargs(self, domain, task):
env = suite.load(domain, task,
environment_kwargs=dict(flat_observation=True))
# Check that the kwargs are actually passed through to the environment.
self.assertSetEqual(set(env.observation_spec()),
{control.FLAT_OBSERVATION_KEY})
@parameterized.parameters(*suite.ALL_TASKS)
def test_observation_arrays_dont_share_memory(self, domain, task):
env = suite.load(domain, task)
first_timestep = env.reset()
action = np.zeros(env.action_spec().shape)
second_timestep = env.step(action)
for name, first_array in six.iteritems(first_timestep.observation):
second_array = second_timestep.observation[name]
self.assertFalse(
np.may_share_memory(first_array, second_array),
msg='Consecutive observations of {!r} may share memory.'.format(name))
@parameterized.parameters(*suite.ALL_TASKS)
def test_observations_dont_contain_constant_elements(self, domain, task):
env = suite.load(domain, task)
trajectory = make_trajectory(domain=domain, task=task, seed=0,
num_episodes=2, max_steps_per_episode=1000)
observations = {name: [] for name in env.observation_spec()}
for time_step in trajectory:
for name, array in six.iteritems(time_step.observation):
observations[name].append(array)
failures = []
for name, array_list in six.iteritems(observations):
# Sampling random uniform actions generally isn't sufficient to trigger
# these touch sensors.
if (domain in ('manipulator', 'stacker') and name == 'touch' or
domain == 'quadruped' and name == 'force_torque'):
continue
stacked_arrays = np.array(array_list)
is_constant = np.all(stacked_arrays == stacked_arrays[0], axis=0)
has_constant_elements = (
is_constant if np.isscalar(is_constant) else np.any(is_constant))
if has_constant_elements:
failures.append((name, is_constant))
self.assertEmpty(
failures,
msg='The following observation(s) contain constant elements:\n{}'
.format('\n'.join(':\t'.join([name, str(is_constant)])
for (name, is_constant) in failures)))
@parameterized.parameters(*suite.ALL_TASKS)
def test_initial_state_is_randomized(self, domain, task):
env = suite.load(domain, task, task_kwargs={'random': 42})
obs1 = env.reset().observation
obs2 = env.reset().observation
self.assertFalse(
all(np.all(obs1[k] == obs2[k]) for k in obs1),
'Two consecutive initial states have identical observations.\n'
'First: {}\nSecond: {}'.format(obs1, obs2))
if __name__ == '__main__':
absltest.main()
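
The two helpers at the top of this file are also usable outside the test harness. A rough sketch (reusing uniform_random_policy and step_environment as defined above) of rolling out a short random-action trajectory on a registered task:

from dm_control import suite

env = suite.load('cartpole', 'swingup', task_kwargs={'random': 0})
policy = uniform_random_policy(env.action_spec(), random=0)
for time_step in step_environment(env, policy,
                                  num_episodes=1, max_steps_per_episode=20):
    print(time_step.step_type, time_step.reward, time_step.discount)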

View File

@ -0,0 +1,52 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tests for the dm_control.suite loader."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Internal dependencies.
from absl.testing import absltest
from dm_control import suite
from dm_control.rl import control
class LoaderTest(absltest.TestCase):
def test_load_without_kwargs(self):
env = suite.load('cartpole', 'swingup')
self.assertIsInstance(env, control.Environment)
def test_load_with_kwargs(self):
env = suite.load('cartpole', 'swingup',
task_kwargs={'time_limit': 40, 'random': 99})
self.assertIsInstance(env, control.Environment)
class LoaderConstantsTest(absltest.TestCase):
def testSuiteConstants(self):
self.assertNotEmpty(suite.BENCHMARKING)
self.assertNotEmpty(suite.EASY)
self.assertNotEmpty(suite.HARD)
self.assertNotEmpty(suite.EXTRA)
if __name__ == '__main__':
absltest.main()

View File

@ -0,0 +1,88 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tests specific to the LQR domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import unittest
# Internal dependencies.
from absl import logging
from absl.testing import absltest
from absl.testing import parameterized
from local_dm_control_suite import lqr
from local_dm_control_suite import lqr_solver
import numpy as np
from six.moves import range
class LqrTest(parameterized.TestCase):
@parameterized.named_parameters(
('lqr_2_1', lqr.lqr_2_1),
('lqr_6_2', lqr.lqr_6_2))
def test_lqr_optimal_policy(self, make_env):
env = make_env()
p, k, beta = lqr_solver.solve(env)
self.assertPolicyisOptimal(env, p, k, beta)
@parameterized.named_parameters(
('lqr_2_1', lqr.lqr_2_1),
('lqr_6_2', lqr.lqr_6_2))
@unittest.skipUnless(
condition=lqr_solver.sp,
reason='scipy is not available, so non-scipy DARE solver is the default.')
def test_lqr_optimal_policy_no_scipy(self, make_env):
env = make_env()
old_sp = lqr_solver.sp
try:
lqr_solver.sp = None # Force the solver to use the non-scipy code path.
p, k, beta = lqr_solver.solve(env)
finally:
lqr_solver.sp = old_sp
self.assertPolicyisOptimal(env, p, k, beta)
def assertPolicyisOptimal(self, env, p, k, beta):
tolerance = 1e-3
n_steps = int(math.ceil(math.log10(tolerance) / math.log10(beta)))
logging.info('%d timesteps for %g convergence.', n_steps, tolerance)
total_loss = 0.0
timestep = env.reset()
initial_state = np.hstack((timestep.observation['position'],
timestep.observation['velocity']))
logging.info('Measuring total cost over %d steps.', n_steps)
for _ in range(n_steps):
x = np.hstack((timestep.observation['position'],
timestep.observation['velocity']))
# u = k*x is the optimal policy
u = k.dot(x)
total_loss += 1 - (timestep.reward or 0.0)
timestep = env.step(u)
logging.info('Analytical expected total cost is .5*x^T*p*x.')
expected_loss = .5 * initial_state.T.dot(p).dot(initial_state)
logging.info('Comparing measured and predicted costs.')
np.testing.assert_allclose(expected_loss, total_loss, rtol=tolerance)
if __name__ == '__main__':
absltest.main()
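
For reference, the quantities assertPolicyisOptimal checks are the standard discrete-time LQR identities (a sketch; the notation below is assumed here, and the sign convention folded into K may differ in lqr_solver). With dynamics and per-step cost

  x_{t+1} = A x_t + B u_t, \qquad
  \ell(x_t, u_t) = \tfrac{1}{2}\left(x_t^\top Q x_t + u_t^\top R u_t\right),

P solves the discrete algebraic Riccati equation

  P = Q + A^\top P A - A^\top P B\,(R + B^\top P B)^{-1} B^\top P A,

the optimal policy is linear, u_t = K x_t with K = -(R + B^\top P B)^{-1} B^\top P A, and the optimal cost-to-go from the initial state is V(x_0) = \tfrac{1}{2} x_0^\top P x_0, which is the expected_loss compared above against the accumulated 1 - reward.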

View File

@ -0,0 +1,16 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Utility functions used in the control suite."""

View File

@ -0,0 +1,251 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse and convert amc motion capture data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control.mujoco.wrapper import mjbindings
import numpy as np
from scipy import interpolate
from six.moves import range
mjlib = mjbindings.mjlib
MOCAP_DT = 1.0/120.0
CONVERSION_LENGTH = 0.056444
_CMU_MOCAP_JOINT_ORDER = (
'root0', 'root1', 'root2', 'root3', 'root4', 'root5', 'lowerbackrx',
'lowerbackry', 'lowerbackrz', 'upperbackrx', 'upperbackry', 'upperbackrz',
'thoraxrx', 'thoraxry', 'thoraxrz', 'lowerneckrx', 'lowerneckry',
'lowerneckrz', 'upperneckrx', 'upperneckry', 'upperneckrz', 'headrx',
'headry', 'headrz', 'rclaviclery', 'rclaviclerz', 'rhumerusrx',
'rhumerusry', 'rhumerusrz', 'rradiusrx', 'rwristry', 'rhandrx', 'rhandrz',
'rfingersrx', 'rthumbrx', 'rthumbrz', 'lclaviclery', 'lclaviclerz',
'lhumerusrx', 'lhumerusry', 'lhumerusrz', 'lradiusrx', 'lwristry',
'lhandrx', 'lhandrz', 'lfingersrx', 'lthumbrx', 'lthumbrz', 'rfemurrx',
'rfemurry', 'rfemurrz', 'rtibiarx', 'rfootrx', 'rfootrz', 'rtoesrx',
'lfemurrx', 'lfemurry', 'lfemurrz', 'ltibiarx', 'lfootrx', 'lfootrz',
'ltoesrx'
)
Converted = collections.namedtuple('Converted',
['qpos', 'qvel', 'time'])
def convert(file_name, physics, timestep):
"""Converts the parsed .amc values into qpos and qvel values and resamples.
Args:
file_name: The .amc file to be parsed and converted.
physics: The corresponding physics instance.
timestep: Desired output interval between resampled frames.
Returns:
A namedtuple with fields:
`qpos`, a numpy array containing converted positional variables.
`qvel`, a numpy array containing converted velocity variables.
`time`, a numpy array containing the corresponding times.
"""
frame_values = parse(file_name)
joint2index = {}
for name in physics.named.data.qpos.axes.row.names:
joint2index[name] = physics.named.data.qpos.axes.row.convert_key_item(name)
index2joint = {}
for joint, index in joint2index.items():
if isinstance(index, slice):
indices = range(index.start, index.stop)
else:
indices = [index]
for ii in indices:
index2joint[ii] = joint
# Convert frame_values to qpos
amcvals2qpos_transformer = Amcvals2qpos(index2joint, _CMU_MOCAP_JOINT_ORDER)
qpos_values = []
for frame_value in frame_values:
qpos_values.append(amcvals2qpos_transformer(frame_value))
qpos_values = np.stack(qpos_values) # Time by nq
# Interpolate/resample.
# Note: interpolate quaternions rather than euler angles (slerp).
# see https://en.wikipedia.org/wiki/Slerp
qpos_values_resampled = []
time_vals = np.arange(0, len(frame_values)*MOCAP_DT - 1e-8, MOCAP_DT)
time_vals_new = np.arange(0, len(frame_values)*MOCAP_DT, timestep)
while time_vals_new[-1] > time_vals[-1]:
time_vals_new = time_vals_new[:-1]
for i in range(qpos_values.shape[1]):
f = interpolate.splrep(time_vals, qpos_values[:, i])
qpos_values_resampled.append(interpolate.splev(time_vals_new, f))
qpos_values_resampled = np.stack(qpos_values_resampled) # nq by ntime
qvel_list = []
for t in range(qpos_values_resampled.shape[1]-1):
p_tp1 = qpos_values_resampled[:, t + 1]
p_t = qpos_values_resampled[:, t]
qvel = [(p_tp1[:3]-p_t[:3])/ timestep,
mj_quat2vel(mj_quatdiff(p_t[3:7], p_tp1[3:7]), timestep),
(p_tp1[7:]-p_t[7:])/ timestep]
qvel_list.append(np.concatenate(qvel))
qvel_values_resampled = np.vstack(qvel_list).T
return Converted(qpos_values_resampled, qvel_values_resampled, time_vals_new)
def parse(file_name):
"""Parses the amc file format."""
values = []
fid = open(file_name, 'r')
line = fid.readline().strip()
frame_ind = 1
first_frame = True
while True:
# Parse first frame.
if first_frame and line[0] == str(frame_ind):
first_frame = False
frame_ind += 1
frame_vals = []
while True:
line = fid.readline().strip()
if not line or line == str(frame_ind):
          values.append(np.array(frame_vals, dtype=float))
break
tokens = line.split()
frame_vals.extend(tokens[1:])
# Parse other frames.
elif line == str(frame_ind):
frame_ind += 1
frame_vals = []
while True:
line = fid.readline().strip()
if not line or line == str(frame_ind):
          values.append(np.array(frame_vals, dtype=float))
break
tokens = line.split()
frame_vals.extend(tokens[1:])
else:
line = fid.readline().strip()
if not line:
break
return values
class Amcvals2qpos(object):
"""Callable that converts .amc values for a frame and to MuJoCo qpos format.
"""
def __init__(self, index2joint, joint_order):
"""Initializes a new Amcvals2qpos instance.
Args:
      index2joint: Mapping from qpos indices to the corresponding joint names.
      joint_order: Sequence of CMU mocap joint names, in .amc file order.
"""
# Root is x,y,z, then quat.
    # We need the qpos indices that correspond to the default .amc joint order.
    self.qpos_root_xyz_ind = [0, 1, 2]
    self.root_xyz_transform = np.array(
        [[1, 0, 0], [0, 0, -1], [0, 1, 0]]) * CONVERSION_LENGTH
self.qpos_root_quat_ind = [3, 4, 5, 6]
amc2qpos_transform = np.zeros((len(index2joint), len(joint_order)))
for i in range(len(index2joint)):
for j in range(len(joint_order)):
if index2joint[i] == joint_order[j]:
if 'rx' in index2joint[i]:
amc2qpos_transform[i][j] = 1
elif 'ry' in index2joint[i]:
amc2qpos_transform[i][j] = 1
elif 'rz' in index2joint[i]:
amc2qpos_transform[i][j] = 1
self.amc2qpos_transform = amc2qpos_transform
def __call__(self, amc_val):
"""Converts a `.amc` frame to MuJoCo qpos format."""
amc_val_rad = np.deg2rad(amc_val)
qpos = np.dot(self.amc2qpos_transform, amc_val_rad)
# Root.
    qpos[:3] = np.dot(self.root_xyz_transform, amc_val[:3])
qpos_quat = euler2quat(amc_val[3], amc_val[4], amc_val[5])
qpos_quat = mj_quatprod(euler2quat(90, 0, 0), qpos_quat)
for i, ind in enumerate(self.qpos_root_quat_ind):
qpos[ind] = qpos_quat[i]
return qpos
def euler2quat(ax, ay, az):
"""Converts euler angles to a quaternion.
Note: rotation order is zyx
Args:
ax: Roll angle (deg)
ay: Pitch angle (deg).
az: Yaw angle (deg).
Returns:
A numpy array representing the rotation as a quaternion.
"""
r1 = az
r2 = ay
r3 = ax
c1 = np.cos(np.deg2rad(r1 / 2))
s1 = np.sin(np.deg2rad(r1 / 2))
c2 = np.cos(np.deg2rad(r2 / 2))
s2 = np.sin(np.deg2rad(r2 / 2))
c3 = np.cos(np.deg2rad(r3 / 2))
s3 = np.sin(np.deg2rad(r3 / 2))
q0 = c1 * c2 * c3 + s1 * s2 * s3
q1 = c1 * c2 * s3 - s1 * s2 * c3
q2 = c1 * s2 * c3 + s1 * c2 * s3
q3 = s1 * c2 * c3 - c1 * s2 * s3
return np.array([q0, q1, q2, q3])
def mj_quatprod(q, r):
quaternion = np.zeros(4)
mjlib.mju_mulQuat(quaternion, np.ascontiguousarray(q),
np.ascontiguousarray(r))
return quaternion
def mj_quat2vel(q, dt):
vel = np.zeros(3)
mjlib.mju_quat2Vel(vel, np.ascontiguousarray(q), dt)
return vel
def mj_quatneg(q):
quaternion = np.zeros(4)
mjlib.mju_negQuat(quaternion, np.ascontiguousarray(q))
return quaternion
def mj_quatdiff(source, target):
return mj_quatprod(mj_quatneg(source), np.ascontiguousarray(target))
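
A quick numerical sanity check of euler2quat (a self-contained sketch, not part of the module): a pure 90-degree roll should give the quaternion (cos 45 deg, sin 45 deg, 0, 0), which is also the fixed root correction applied in Amcvals2qpos.__call__ via mj_quatprod(euler2quat(90, 0, 0), qpos_quat).

import numpy as np

q = euler2quat(90, 0, 0)  # ax=90 (roll), ay=az=0
np.testing.assert_allclose(
    q, [np.cos(np.pi / 4), np.sin(np.pi / 4), 0.0, 0.0], atol=1e-12)
print(q)  # ~ [0.7071, 0.7071, 0, 0]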

View File

@ -0,0 +1,68 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tests for parse_amc utility."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
# Internal dependencies.
from absl.testing import absltest
from local_dm_control_suite import humanoid_CMU
from dm_control.suite.utils import parse_amc
from dm_control.utils import io as resources
_TEST_AMC_PATH = resources.GetResourceFilename(
os.path.join(os.path.dirname(__file__), '../demos/zeros.amc'))
class ParseAMCTest(absltest.TestCase):
def test_sizes_of_parsed_data(self):
# Instantiate the humanoid environment.
env = humanoid_CMU.stand()
# Parse and convert specified clip.
converted = parse_amc.convert(
_TEST_AMC_PATH, env.physics, env.control_timestep())
self.assertEqual(converted.qpos.shape[0], 63)
self.assertEqual(converted.qvel.shape[0], 62)
self.assertEqual(converted.time.shape[0], converted.qpos.shape[1])
self.assertEqual(converted.qpos.shape[1],
converted.qvel.shape[1] + 1)
# Parse and convert specified clip -- WITH SMALLER TIMESTEP
converted2 = parse_amc.convert(
_TEST_AMC_PATH, env.physics, 0.5 * env.control_timestep())
self.assertEqual(converted2.qpos.shape[0], 63)
self.assertEqual(converted2.qvel.shape[0], 62)
self.assertEqual(converted2.time.shape[0], converted2.qpos.shape[1])
self.assertEqual(converted.qpos.shape[1],
converted.qvel.shape[1] + 1)
# Compare sizes of parsed objects for different timesteps
self.assertEqual(converted.qpos.shape[1] * 2, converted2.qpos.shape[1])
if __name__ == '__main__':
absltest.main()

View File

@ -0,0 +1,91 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Randomization functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dm_control.mujoco.wrapper import mjbindings
import numpy as np
from six.moves import range
def random_limited_quaternion(random, limit):
"""Generates a random quaternion limited to the specified rotations."""
axis = random.randn(3)
axis /= np.linalg.norm(axis)
angle = random.rand() * limit
quaternion = np.zeros(4)
mjbindings.mjlib.mju_axisAngle2Quat(quaternion, axis, angle)
return quaternion
def randomize_limited_and_rotational_joints(physics, random=None):
"""Randomizes the positions of joints defined in the physics body.
The following randomization rules apply:
- Bounded joints (hinges or sliders) are sampled uniformly in the bounds.
  - Unbounded hinges are sampled uniformly in [-pi, pi].
- Quaternions for unlimited free joints and ball joints are sampled
uniformly on the unit 3-sphere.
- Quaternions for limited ball joints are sampled uniformly on a sector
of the unit 3-sphere.
- The linear degrees of freedom of free joints are not randomized.
Args:
physics: Instance of 'Physics' class that holds a loaded model.
random: Optional instance of 'np.random.RandomState'. Defaults to the global
NumPy random state.
"""
random = random or np.random
hinge = mjbindings.enums.mjtJoint.mjJNT_HINGE
slide = mjbindings.enums.mjtJoint.mjJNT_SLIDE
ball = mjbindings.enums.mjtJoint.mjJNT_BALL
free = mjbindings.enums.mjtJoint.mjJNT_FREE
qpos = physics.named.data.qpos
for joint_id in range(physics.model.njnt):
joint_name = physics.model.id2name(joint_id, 'joint')
joint_type = physics.model.jnt_type[joint_id]
is_limited = physics.model.jnt_limited[joint_id]
range_min, range_max = physics.model.jnt_range[joint_id]
if is_limited:
if joint_type == hinge or joint_type == slide:
qpos[joint_name] = random.uniform(range_min, range_max)
elif joint_type == ball:
qpos[joint_name] = random_limited_quaternion(random, range_max)
else:
if joint_type == hinge:
qpos[joint_name] = random.uniform(-np.pi, np.pi)
elif joint_type == ball:
quat = random.randn(4)
quat /= np.linalg.norm(quat)
qpos[joint_name] = quat
elif joint_type == free:
quat = random.rand(4)
quat /= np.linalg.norm(quat)
qpos[joint_name][3:] = quat
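Editor's note: the short sketch below is an editorial addition, not part of the original file. It shows one way the randomizer above might be driven, assuming a standard dm_control suite model is available; the randomizer is normally called inside `physics.reset_context()` when an episode starts, as the walker task later in this commit does.
import numpy as np
from dm_control import suite
from dm_control.suite.utils import randomizers

env = suite.load('walker', 'walk')
rng = np.random.RandomState(0)
with env.physics.reset_context():
    # Bounded hinges/sliders are drawn inside their limits, unbounded hinges in
    # [-pi, pi], and ball/free-joint orientations on the unit quaternion sphere.
    randomizers.randomize_limited_and_rotational_joints(env.physics, rng)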

View File

@ -0,0 +1,164 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tests for randomizers.py."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Internal dependencies.
from absl.testing import absltest
from absl.testing import parameterized
from dm_control import mujoco
from dm_control.mujoco.wrapper import mjbindings
from dm_control.suite.utils import randomizers
import numpy as np
from six.moves import range
mjlib = mjbindings.mjlib
class RandomizeUnlimitedJointsTest(parameterized.TestCase):
def setUp(self):
self.rand = np.random.RandomState(100)
def test_single_joint_of_each_type(self):
physics = mujoco.Physics.from_xml_string("""<mujoco>
<default>
<joint range="0 90" />
</default>
<worldbody>
<body>
<geom type="box" size="1 1 1"/>
<joint name="free" type="free"/>
</body>
<body>
<geom type="box" size="1 1 1"/>
<joint name="limited_hinge" type="hinge" limited="true"/>
<joint name="slide" type="slide"/>
<joint name="limited_slide" type="slide" limited="true"/>
<joint name="hinge" type="hinge"/>
</body>
<body>
<geom type="box" size="1 1 1"/>
<joint name="ball" type="ball"/>
</body>
<body>
<geom type="box" size="1 1 1"/>
<joint name="limited_ball" type="ball" limited="true"/>
</body>
</worldbody>
</mujoco>""")
randomizers.randomize_limited_and_rotational_joints(physics, self.rand)
self.assertNotEqual(0., physics.named.data.qpos['hinge'])
self.assertNotEqual(0., physics.named.data.qpos['limited_hinge'])
self.assertNotEqual(0., physics.named.data.qpos['limited_slide'])
self.assertNotEqual(0., np.sum(physics.named.data.qpos['ball']))
self.assertNotEqual(0., np.sum(physics.named.data.qpos['limited_ball']))
self.assertNotEqual(0., np.sum(physics.named.data.qpos['free'][3:]))
# The unlimited slide joint and the positional part of the free joint remain
# uninitialized.
self.assertEqual(0., physics.named.data.qpos['slide'])
self.assertEqual(0., np.sum(physics.named.data.qpos['free'][:3]))
def test_multiple_joints_of_same_type(self):
physics = mujoco.Physics.from_xml_string("""<mujoco>
<worldbody>
<body>
<geom type="box" size="1 1 1"/>
<joint name="hinge_1" type="hinge"/>
<joint name="hinge_2" type="hinge"/>
<joint name="hinge_3" type="hinge"/>
</body>
</worldbody>
</mujoco>""")
randomizers.randomize_limited_and_rotational_joints(physics, self.rand)
self.assertNotEqual(0., physics.named.data.qpos['hinge_1'])
self.assertNotEqual(0., physics.named.data.qpos['hinge_2'])
self.assertNotEqual(0., physics.named.data.qpos['hinge_3'])
self.assertNotEqual(physics.named.data.qpos['hinge_1'],
physics.named.data.qpos['hinge_2'])
self.assertNotEqual(physics.named.data.qpos['hinge_2'],
physics.named.data.qpos['hinge_3'])
self.assertNotEqual(physics.named.data.qpos['hinge_1'],
physics.named.data.qpos['hinge_3'])
def test_unlimited_hinge_randomization_range(self):
physics = mujoco.Physics.from_xml_string("""<mujoco>
<worldbody>
<body>
<geom type="box" size="1 1 1"/>
<joint name="hinge" type="hinge"/>
</body>
</worldbody>
</mujoco>""")
for _ in range(10):
randomizers.randomize_limited_and_rotational_joints(physics, self.rand)
self.assertBetween(physics.named.data.qpos['hinge'], -np.pi, np.pi)
def test_limited_1d_joint_limits_are_respected(self):
physics = mujoco.Physics.from_xml_string("""<mujoco>
<default>
<joint limited="true"/>
</default>
<worldbody>
<body>
<geom type="box" size="1 1 1"/>
<joint name="hinge" type="hinge" range="0 10"/>
<joint name="slide" type="slide" range="30 50"/>
</body>
</worldbody>
</mujoco>""")
for _ in range(10):
randomizers.randomize_limited_and_rotational_joints(physics, self.rand)
self.assertBetween(physics.named.data.qpos['hinge'],
np.deg2rad(0), np.deg2rad(10))
self.assertBetween(physics.named.data.qpos['slide'], 30, 50)
def test_limited_ball_joint_are_respected(self):
physics = mujoco.Physics.from_xml_string("""<mujoco>
<worldbody>
<body name="body" zaxis="1 0 0">
<geom type="box" size="1 1 1"/>
<joint name="ball" type="ball" limited="true" range="0 60"/>
</body>
</worldbody>
</mujoco>""")
body_axis = np.array([1., 0., 0.])
joint_axis = np.zeros(3)
for _ in range(10):
randomizers.randomize_limited_and_rotational_joints(physics, self.rand)
quat = physics.named.data.qpos['ball']
mjlib.mju_rotVecQuat(joint_axis, body_axis, quat)
angle_cos = np.dot(body_axis, joint_axis)
self.assertGreater(angle_cos, 0.5) # cos(60) = 0.5
if __name__ == '__main__':
absltest.main()

View File

@ -0,0 +1,158 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Planar Walker Domain."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from dm_control import mujoco
from dm_control.rl import control
from local_dm_control_suite import base
from local_dm_control_suite import common
from dm_control.suite.utils import randomizers
from dm_control.utils import containers
from dm_control.utils import rewards
_DEFAULT_TIME_LIMIT = 25
_CONTROL_TIMESTEP = .025
# Minimal height of torso over foot above which stand reward is 1.
_STAND_HEIGHT = 1.2
# Horizontal speeds (meters/second) above which move reward is 1.
_WALK_SPEED = 1
_RUN_SPEED = 8
SUITE = containers.TaggedTasks()
def get_model_and_assets():
"""Returns a tuple containing the model XML string and a dict of assets."""
return common.read_model('walker.xml'), common.ASSETS
@SUITE.add('benchmarking')
def stand(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Stand task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = PlanarWalker(move_speed=0, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add('benchmarking')
def walk(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Walk task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = PlanarWalker(move_speed=_WALK_SPEED, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
@SUITE.add('benchmarking')
def run(time_limit=_DEFAULT_TIME_LIMIT, random=None, environment_kwargs=None):
"""Returns the Run task."""
physics = Physics.from_xml_string(*get_model_and_assets())
task = PlanarWalker(move_speed=_RUN_SPEED, random=random)
environment_kwargs = environment_kwargs or {}
return control.Environment(
physics, task, time_limit=time_limit, control_timestep=_CONTROL_TIMESTEP,
**environment_kwargs)
class Physics(mujoco.Physics):
"""Physics simulation with additional features for the Walker domain."""
def torso_upright(self):
"""Returns projection from z-axes of torso to the z-axes of world."""
return self.named.data.xmat['torso', 'zz']
def torso_height(self):
"""Returns the height of the torso."""
return self.named.data.xpos['torso', 'z']
def horizontal_velocity(self):
"""Returns the horizontal velocity of the center-of-mass."""
return self.named.data.sensordata['torso_subtreelinvel'][0]
def orientations(self):
"""Returns planar orientations of all bodies."""
return self.named.data.xmat[1:, ['xx', 'xz']].ravel()
class PlanarWalker(base.Task):
"""A planar walker task."""
def __init__(self, move_speed, random=None):
"""Initializes an instance of `PlanarWalker`.
Args:
move_speed: A float. If this value is zero, reward is given simply for
standing up. Otherwise this specifies a target horizontal velocity for
the walking task.
random: Optional, either a `numpy.random.RandomState` instance, an
integer seed for creating a new `RandomState`, or None to select a seed
automatically (default).
"""
self._move_speed = move_speed
super(PlanarWalker, self).__init__(random=random)
def initialize_episode(self, physics):
"""Sets the state of the environment at the start of each episode.
In 'standing' mode, use initial orientation and small velocities.
In 'random' mode, randomize joint angles and let fall to the floor.
Args:
physics: An instance of `Physics`.
"""
randomizers.randomize_limited_and_rotational_joints(physics, self.random)
super(PlanarWalker, self).initialize_episode(physics)
def get_observation(self, physics):
"""Returns an observation of body orientations, height and velocites."""
obs = collections.OrderedDict()
obs['orientations'] = physics.orientations()
obs['height'] = physics.torso_height()
obs['velocity'] = physics.velocity()
return obs
def get_reward(self, physics):
"""Returns a reward to the agent."""
standing = rewards.tolerance(physics.torso_height(),
bounds=(_STAND_HEIGHT, float('inf')),
margin=_STAND_HEIGHT/2)
upright = (1 + physics.torso_upright()) / 2
stand_reward = (3*standing + upright) / 4
if self._move_speed == 0:
return stand_reward
else:
move_reward = rewards.tolerance(physics.horizontal_velocity(),
bounds=(self._move_speed, float('inf')),
margin=self._move_speed/2,
value_at_margin=0.5,
sigmoid='linear')
return stand_reward * (5*move_reward + 1) / 6
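Editor's note: a brief editorial sketch, not part of the original file. The reward composition stays in [0, 1]: `stand_reward` rewards torso height and uprightness, and for a nonzero `move_speed` it is scaled by `(5*move_reward + 1)/6`, so an upright but stationary walker still earns one sixth of its standing reward. Assuming the `local_dm_control_suite` assets are importable, the Walk task defined above can be exercised as follows.
import numpy as np

env = walk()                               # control.Environment wrapping PlanarWalker
spec = env.action_spec()
time_step = env.reset()                    # reward is None on the first time step
time_step = env.step(np.zeros(spec.shape))
print(time_step.reward)                    # stand_reward * (5*move_reward + 1) / 6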

View File

@ -0,0 +1,70 @@
<mujoco model="planar walker">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials_white_floor.xml"/>
<option timestep="0.0025"/>
<statistic extent="2" center="0 0 1"/>
<default>
<joint damping=".1" armature="0.01" limited="true" solimplimit="0 .99 .01"/>
<geom contype="1" conaffinity="0" friction=".7 .1 .1"/>
<motor ctrlrange="-1 1" ctrllimited="true"/>
<site size="0.01"/>
<default class="walker">
<geom material="self" type="capsule"/>
<joint axis="0 -1 0"/>
</default>
</default>
<worldbody>
<geom name="floor" type="plane" conaffinity="1" pos="248 0 0" size="250 .8 .2" material="grid" zaxis="0 0 1"/>
<body name="torso" pos="0 0 1.3" childclass="walker">
<light name="light" pos="0 0 2" mode="trackcom"/>
<camera name="side" pos="0 -2 .7" euler="60 0 0" mode="trackcom"/>
<camera name="back" pos="-2 0 .5" xyaxes="0 -1 0 1 0 3" mode="trackcom"/>
<joint name="rootz" axis="0 0 1" type="slide" limited="false" armature="0" damping="0"/>
<joint name="rootx" axis="1 0 0" type="slide" limited="false" armature="0" damping="0"/>
<joint name="rooty" axis="0 1 0" type="hinge" limited="false" armature="0" damping="0"/>
<geom name="torso" size="0.07 0.3"/>
<body name="right_thigh" pos="0 -.05 -0.3">
<joint name="right_hip" range="-20 100"/>
<geom name="right_thigh" pos="0 0 -0.225" size="0.05 0.225"/>
<body name="right_leg" pos="0 0 -0.7">
<joint name="right_knee" pos="0 0 0.25" range="-150 0"/>
<geom name="right_leg" size="0.04 0.25"/>
<body name="right_foot" pos="0.06 0 -0.25">
<joint name="right_ankle" pos="-0.06 0 0" range="-45 45"/>
<geom name="right_foot" zaxis="1 0 0" size="0.05 0.1"/>
</body>
</body>
</body>
<body name="left_thigh" pos="0 .05 -0.3" >
<joint name="left_hip" range="-20 100"/>
<geom name="left_thigh" pos="0 0 -0.225" size="0.05 0.225"/>
<body name="left_leg" pos="0 0 -0.7">
<joint name="left_knee" pos="0 0 0.25" range="-150 0"/>
<geom name="left_leg" size="0.04 0.25"/>
<body name="left_foot" pos="0.06 0 -0.25">
<joint name="left_ankle" pos="-0.06 0 0" range="-45 45"/>
<geom name="left_foot" zaxis="1 0 0" size="0.05 0.1"/>
</body>
</body>
</body>
</body>
</worldbody>
<sensor>
<subtreelinvel name="torso_subtreelinvel" body="torso"/>
</sensor>
<actuator>
<motor name="right_hip" joint="right_hip" gear="100"/>
<motor name="right_knee" joint="right_knee" gear="50"/>
<motor name="right_ankle" joint="right_ankle" gear="20"/>
<motor name="left_hip" joint="left_hip" gear="100"/>
<motor name="left_knee" joint="left_knee" gear="50"/>
<motor name="left_ankle" joint="left_ankle" gear="20"/>
</actuator>
</mujoco>

View File

@ -0,0 +1,16 @@
# Copyright 2018 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Environment wrappers used to extend or modify environment behaviour."""

View File

@ -0,0 +1,74 @@
# Copyright 2018 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Wrapper control suite environments that adds Gaussian noise to actions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import dm_env
import numpy as np
_BOUNDS_MUST_BE_FINITE = (
'All bounds in `env.action_spec()` must be finite, got: {action_spec}')
class Wrapper(dm_env.Environment):
"""Wraps a control environment and adds Gaussian noise to actions."""
def __init__(self, env, scale=0.01):
"""Initializes a new action noise Wrapper.
Args:
env: The control suite environment to wrap.
scale: The standard deviation of the noise, expressed as a fraction
of the max-min range for each action dimension.
Raises:
ValueError: If any of the action dimensions of the wrapped environment are
unbounded.
"""
action_spec = env.action_spec()
if not (np.all(np.isfinite(action_spec.minimum)) and
np.all(np.isfinite(action_spec.maximum))):
raise ValueError(_BOUNDS_MUST_BE_FINITE.format(action_spec=action_spec))
self._minimum = action_spec.minimum
self._maximum = action_spec.maximum
self._noise_std = scale * (action_spec.maximum - action_spec.minimum)
self._env = env
def step(self, action):
noisy_action = action + self._env.task.random.normal(scale=self._noise_std)
# Clip the noisy actions in place so that they fall within the bounds
# specified by the `action_spec`. Note that MuJoCo implicitly clips out-of-
# bounds control inputs, but we also clip here in case the actions do not
# correspond directly to MuJoCo actuators, or if there are other wrapper
# layers that expect the actions to be within bounds.
np.clip(noisy_action, self._minimum, self._maximum, out=noisy_action)
return self._env.step(noisy_action)
def reset(self):
return self._env.reset()
def observation_spec(self):
return self._env.observation_spec()
def action_spec(self):
return self._env.action_spec()
def __getattr__(self, name):
return getattr(self._env, name)
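Editor's note: an editorial usage sketch, not part of the original file. With the default `scale=0.01`, each action dimension receives zero-mean Gaussian noise whose standard deviation is 1% of that dimension's range, and the noisy action is clipped back into the action spec before being passed on.
from dm_control import suite
import numpy as np

env = Wrapper(suite.load('cartpole', 'swingup'), scale=0.01)
spec = env.action_spec()
env.reset()
time_step = env.step(np.zeros(spec.shape))  # noise added, then clipped to spec bounds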

View File

@ -0,0 +1,136 @@
# Copyright 2018 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tests for the action noise wrapper."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Internal dependencies.
from absl.testing import absltest
from absl.testing import parameterized
from dm_control.rl import control
from dm_control.suite.wrappers import action_noise
from dm_env import specs
import mock
import numpy as np
class ActionNoiseTest(parameterized.TestCase):
def make_action_spec(self, lower=(-1.,), upper=(1.,)):
lower, upper = np.broadcast_arrays(lower, upper)
return specs.BoundedArray(
shape=lower.shape, dtype=float, minimum=lower, maximum=upper)
def make_mock_env(self, action_spec=None):
action_spec = action_spec or self.make_action_spec()
env = mock.Mock(spec=control.Environment)
env.action_spec.return_value = action_spec
return env
def assertStepCalledOnceWithCorrectAction(self, env, expected_action):
# NB: `assert_called_once_with()` doesn't support numpy arrays.
env.step.assert_called_once()
actual_action = env.step.call_args_list[0][0][0]
np.testing.assert_array_equal(expected_action, actual_action)
@parameterized.parameters([
dict(lower=np.r_[-1., 0.], upper=np.r_[1., 2.], scale=0.05),
dict(lower=np.r_[-1., 0.], upper=np.r_[1., 2.], scale=0.),
dict(lower=np.r_[-1., 0.], upper=np.r_[-1., 0.], scale=0.05),
])
def test_step(self, lower, upper, scale):
seed = 0
std = scale * (upper - lower)
expected_noise = np.random.RandomState(seed).normal(scale=std)
action = np.random.RandomState(seed).uniform(lower, upper)
expected_noisy_action = np.clip(action + expected_noise, lower, upper)
task = mock.Mock(spec=control.Task)
task.random = np.random.RandomState(seed)
action_spec = self.make_action_spec(lower=lower, upper=upper)
env = self.make_mock_env(action_spec=action_spec)
env.task = task
wrapped_env = action_noise.Wrapper(env, scale=scale)
time_step = wrapped_env.step(action)
self.assertStepCalledOnceWithCorrectAction(env, expected_noisy_action)
self.assertIs(time_step, env.step(expected_noisy_action))
@parameterized.named_parameters([
dict(testcase_name='within_bounds', action=np.r_[-1.], noise=np.r_[0.1]),
dict(testcase_name='below_lower', action=np.r_[-1.], noise=np.r_[-0.1]),
dict(testcase_name='above_upper', action=np.r_[1.], noise=np.r_[0.1]),
])
def test_action_clipping(self, action, noise):
lower = -1.
upper = 1.
expected_noisy_action = np.clip(action + noise, lower, upper)
task = mock.Mock(spec=control.Task)
task.random = mock.Mock(spec=np.random.RandomState)
task.random.normal.return_value = noise
action_spec = self.make_action_spec(lower=lower, upper=upper)
env = self.make_mock_env(action_spec=action_spec)
env.task = task
wrapped_env = action_noise.Wrapper(env)
time_step = wrapped_env.step(action)
self.assertStepCalledOnceWithCorrectAction(env, expected_noisy_action)
self.assertIs(time_step, env.step(expected_noisy_action))
@parameterized.parameters([
dict(lower=np.r_[-1., 0.], upper=np.r_[1., np.inf]),
dict(lower=np.r_[np.nan, 0.], upper=np.r_[1., 2.]),
])
def test_error_if_action_bounds_non_finite(self, lower, upper):
action_spec = self.make_action_spec(lower=lower, upper=upper)
env = self.make_mock_env(action_spec=action_spec)
with self.assertRaisesWithLiteralMatch(
ValueError,
action_noise._BOUNDS_MUST_BE_FINITE.format(action_spec=action_spec)):
_ = action_noise.Wrapper(env)
def test_reset(self):
env = self.make_mock_env()
wrapped_env = action_noise.Wrapper(env)
time_step = wrapped_env.reset()
env.reset.assert_called_once_with()
self.assertIs(time_step, env.reset())
def test_observation_spec(self):
env = self.make_mock_env()
wrapped_env = action_noise.Wrapper(env)
observation_spec = wrapped_env.observation_spec()
env.observation_spec.assert_called_once_with()
self.assertIs(observation_spec, env.observation_spec())
def test_action_spec(self):
env = self.make_mock_env()
wrapped_env = action_noise.Wrapper(env)
# `env.action_spec()` is called in `Wrapper.__init__()`
env.action_spec.reset_mock()
action_spec = wrapped_env.action_spec()
env.action_spec.assert_called_once_with()
self.assertIs(action_spec, env.action_spec())
@parameterized.parameters(['task', 'physics', 'control_timestep'])
def test_getattr(self, attribute_name):
env = self.make_mock_env()
wrapped_env = action_noise.Wrapper(env)
attr = getattr(wrapped_env, attribute_name)
self.assertIs(attr, getattr(env, attribute_name))
if __name__ == '__main__':
absltest.main()

View File

@ -0,0 +1,120 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Wrapper that adds pixel observations to a control environment."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import dm_env
from dm_env import specs
STATE_KEY = 'state'
class Wrapper(dm_env.Environment):
"""Wraps a control environment and adds a rendered pixel observation."""
def __init__(self, env, pixels_only=True, render_kwargs=None,
observation_key='pixels'):
"""Initializes a new pixel Wrapper.
Args:
env: The environment to wrap.
pixels_only: If True (default), the original set of 'state' observations
returned by the wrapped environment will be discarded, and the
`OrderedDict` of observations will only contain pixels. If False, the
`OrderedDict` will contain the original observations as well as the
pixel observations.
render_kwargs: Optional `dict` containing keyword arguments passed to the
`mujoco.Physics.render` method.
observation_key: Optional custom string specifying the pixel observation's
key in the `OrderedDict` of observations. Defaults to 'pixels'.
Raises:
ValueError: If `env`'s observation spec is not compatible with the
wrapper. Supported formats are a single array, or a dict of arrays.
ValueError: If `env`'s observation already contains the specified
`observation_key`.
"""
if render_kwargs is None:
render_kwargs = {}
wrapped_observation_spec = env.observation_spec()
if isinstance(wrapped_observation_spec, specs.Array):
self._observation_is_dict = False
invalid_keys = set([STATE_KEY])
elif isinstance(wrapped_observation_spec, collections.MutableMapping):
self._observation_is_dict = True
invalid_keys = set(wrapped_observation_spec.keys())
else:
raise ValueError('Unsupported observation spec structure.')
if not pixels_only and observation_key in invalid_keys:
raise ValueError('Duplicate or reserved observation key {!r}.'
.format(observation_key))
if pixels_only:
self._observation_spec = collections.OrderedDict()
elif self._observation_is_dict:
self._observation_spec = wrapped_observation_spec.copy()
else:
self._observation_spec = collections.OrderedDict()
self._observation_spec[STATE_KEY] = wrapped_observation_spec
# Extend observation spec.
pixels = env.physics.render(**render_kwargs)
pixels_spec = specs.Array(
shape=pixels.shape, dtype=pixels.dtype, name=observation_key)
self._observation_spec[observation_key] = pixels_spec
self._env = env
self._pixels_only = pixels_only
self._render_kwargs = render_kwargs
self._observation_key = observation_key
def reset(self):
time_step = self._env.reset()
return self._add_pixel_observation(time_step)
def step(self, action):
time_step = self._env.step(action)
return self._add_pixel_observation(time_step)
def observation_spec(self):
return self._observation_spec
def action_spec(self):
return self._env.action_spec()
def _add_pixel_observation(self, time_step):
if self._pixels_only:
observation = collections.OrderedDict()
elif self._observation_is_dict:
observation = type(time_step.observation)(time_step.observation)
else:
observation = collections.OrderedDict()
observation[STATE_KEY] = time_step.observation
pixels = self._env.physics.render(**self._render_kwargs)
observation[self._observation_key] = pixels
return time_step._replace(observation=observation)
def __getattr__(self, name):
return getattr(self._env, name)
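Editor's note: an editorial usage sketch, not part of the original file. Wrapping a suite environment with `pixels_only=True` replaces the state observations with a single rendered image whose size and camera are controlled by `render_kwargs`.
from dm_control import suite

env = Wrapper(suite.load('cartpole', 'swingup'),
              pixels_only=True,
              render_kwargs={'width': 84, 'height': 84, 'camera_id': 0},
              observation_key='pixels')
time_step = env.reset()
frame = time_step.observation['pixels']  # uint8 array of shape (84, 84, 3)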

View File

@ -0,0 +1,133 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tests for the pixel wrapper."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
# Internal dependencies.
from absl.testing import absltest
from absl.testing import parameterized
from local_dm_control_suite import cartpole
from dm_control.suite.wrappers import pixels
import dm_env
from dm_env import specs
import numpy as np
class FakePhysics(object):
def render(self, *args, **kwargs):
del args
del kwargs
return np.zeros((4, 5, 3), dtype=np.uint8)
class FakeArrayObservationEnvironment(dm_env.Environment):
def __init__(self):
self.physics = FakePhysics()
def reset(self):
return dm_env.restart(np.zeros((2,)))
def step(self, action):
del action
return dm_env.transition(0.0, np.zeros((2,)))
def action_spec(self):
pass
def observation_spec(self):
return specs.Array(shape=(2,), dtype=np.float)
class PixelsTest(parameterized.TestCase):
@parameterized.parameters(True, False)
def test_dict_observation(self, pixels_only):
pixel_key = 'rgb'
env = cartpole.swingup()
# Make sure we are testing the right environment for the test.
observation_spec = env.observation_spec()
self.assertIsInstance(observation_spec, collections.OrderedDict)
width = 320
height = 240
# The wrapper should only add one observation.
wrapped = pixels.Wrapper(env,
observation_key=pixel_key,
pixels_only=pixels_only,
render_kwargs={'width': width, 'height': height})
wrapped_observation_spec = wrapped.observation_spec()
self.assertIsInstance(wrapped_observation_spec, collections.OrderedDict)
if pixels_only:
self.assertLen(wrapped_observation_spec, 1)
self.assertEqual([pixel_key], list(wrapped_observation_spec.keys()))
else:
expected_length = len(observation_spec) + 1
self.assertLen(wrapped_observation_spec, expected_length)
expected_keys = list(observation_spec.keys()) + [pixel_key]
self.assertEqual(expected_keys, list(wrapped_observation_spec.keys()))
# Check that the added spec item is consistent with the added observation.
time_step = wrapped.reset()
rgb_observation = time_step.observation[pixel_key]
wrapped_observation_spec[pixel_key].validate(rgb_observation)
self.assertEqual(rgb_observation.shape, (height, width, 3))
self.assertEqual(rgb_observation.dtype, np.uint8)
@parameterized.parameters(True, False)
def test_single_array_observation(self, pixels_only):
pixel_key = 'depth'
env = FakeArrayObservationEnvironment()
observation_spec = env.observation_spec()
self.assertIsInstance(observation_spec, specs.Array)
wrapped = pixels.Wrapper(env, observation_key=pixel_key,
pixels_only=pixels_only)
wrapped_observation_spec = wrapped.observation_spec()
self.assertIsInstance(wrapped_observation_spec, collections.OrderedDict)
if pixels_only:
self.assertLen(wrapped_observation_spec, 1)
self.assertEqual([pixel_key], list(wrapped_observation_spec.keys()))
else:
self.assertLen(wrapped_observation_spec, 2)
self.assertEqual([pixels.STATE_KEY, pixel_key],
list(wrapped_observation_spec.keys()))
time_step = wrapped.reset()
depth_observation = time_step.observation[pixel_key]
wrapped_observation_spec[pixel_key].validate(depth_observation)
self.assertEqual(depth_observation.shape, (4, 5, 3))
self.assertEqual(depth_observation.dtype, np.uint8)
if __name__ == '__main__':
absltest.main()

297
Dreamer/models.py Normal file
View File

@ -0,0 +1,297 @@
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers as tfkl
from tensorflow_probability import distributions as tfd
from tensorflow.keras.mixed_precision import experimental as prec
import tools
class RSSM(tools.Module):
def __init__(self, stoch=30, deter=200, hidden=200, act=tf.nn.elu):
super().__init__()
self._activation = act
self._stoch_size = stoch
self._deter_size = deter
self._hidden_size = hidden
self._cell = tfkl.GRUCell(self._deter_size)
def initial(self, batch_size):
dtype = prec.global_policy().compute_dtype
return dict(
mean=tf.zeros([batch_size, self._stoch_size], dtype),
std=tf.zeros([batch_size, self._stoch_size], dtype),
stoch=tf.zeros([batch_size, self._stoch_size], dtype),
deter=self._cell.get_initial_state(None, batch_size, dtype))
@tf.function
def observe(self, embed, action, state=None):
if state is None:
state = self.initial(tf.shape(action)[0])
embed = tf.transpose(embed, [1, 0, 2])
action = tf.transpose(action, [1, 0, 2])
post, prior = tools.static_scan(
lambda prev, inputs: self.obs_step(
prev[0], *inputs),
(action, embed), (state, state))
post = {k: tf.transpose(v, [1, 0, 2]) for k, v in post.items()}
prior = {k: tf.transpose(v, [1, 0, 2]) for k, v in prior.items()}
return post, prior
@tf.function
def imagine(self, action, state=None):
if state is None:
state = self.initial(tf.shape(action)[0])
assert isinstance(state, dict), state
action = tf.transpose(action, [1, 0, 2])
prior = tools.static_scan(self.img_step, action, state)
prior = {k: tf.transpose(v, [1, 0, 2]) for k, v in prior.items()}
return prior
def get_feat(self, state):
return tf.concat([state['stoch'], state['deter']], -1)
def get_dist(self, state):
return tfd.MultivariateNormalDiag(state['mean'], state['std'])
@tf.function
def obs_step(self, prev_state, prev_action, embed):
prior = self.img_step(prev_state, prev_action)
x = tf.concat([prior['deter'], embed], -1)
x = self.get('obs1', tfkl.Dense, self._hidden_size,
self._activation)(x)
x = self.get('obs2', tfkl.Dense, 2 * self._stoch_size, None)(x)
mean, std = tf.split(x, 2, -1)
std = tf.nn.softplus(std) + 0.1
stoch = self.get_dist({'mean': mean, 'std': std}).sample()
post = {'mean': mean, 'std': std,
'stoch': stoch, 'deter': prior['deter']}
return post, prior
@tf.function
def img_step(self, prev_state, prev_action):
x = tf.concat([prev_state['stoch'], prev_action], -1)
x = self.get('img1', tfkl.Dense, self._hidden_size,
self._activation)(x)
x, deter = self._cell(x, [prev_state['deter']])
deter = deter[0] # Keras wraps the state in a list.
x = self.get('img2', tfkl.Dense, self._hidden_size,
self._activation)(x)
x = self.get('img3', tfkl.Dense, 2 * self._stoch_size, None)(x)
mean, std = tf.split(x, 2, -1)
std = tf.nn.softplus(std) + 0.1
stoch = self.get_dist({'mean': mean, 'std': std}).sample()
prior = {'mean': mean, 'std': std, 'stoch': stoch, 'deter': deter}
return prior
class ConvEncoder(tools.Module):
def __init__(self, depth=32, act=tf.nn.relu, image_size=64):
self._act = act
self._depth = depth
self._image_size = image_size
if image_size == 64:
self._outdim = 32 * self._depth
self._kernel_sizes = [4, 4, 4, 4]
elif image_size == 32:
self._outdim = 8 * self._depth
self._kernel_sizes = [3, 3, 3, 3]
elif image_size == 84:
self._outdim = 72 * self._depth
self._kernel_sizes = [4, 4, 4, 4]
else:
raise NotImplementedError
def __call__(self, obs):
kwargs = dict(strides=2, activation=self._act)
x = tf.reshape(obs['image'], (-1,) + tuple(obs['image'].shape[-3:]))
x = self.get('h1', tfkl.Conv2D, 1 * self._depth,
self._kernel_sizes[0], **kwargs)(x)
x = self.get('h2', tfkl.Conv2D, 2 * self._depth,
self._kernel_sizes[1], **kwargs)(x)
x = self.get('h3', tfkl.Conv2D, 4 * self._depth,
self._kernel_sizes[2], **kwargs)(x)
x = self.get('h4', tfkl.Conv2D, 8 * self._depth,
self._kernel_sizes[3], **kwargs)(x)
shape = tf.concat([tf.shape(obs['image'])[:-3], [self._outdim]], 0)
return tf.reshape(x, shape)
class ConvDecoder(tools.Module):
def __init__(self, depth=32, act=tf.nn.relu, shape=(64, 64, 3)):
self._act = act
self._depth = depth
self._shape = shape
if shape[0] == 64:
self._outdim = 32 * self._depth
self._kernel_sizes = [5, 5, 6, 6]
elif shape[0] == 32:
self._outdim = 8 * self._depth
self._kernel_sizes = [3, 3, 3, 4]
elif shape[0] == 84:
self._outdim = 72 * self._depth
self._kernel_sizes = [7, 6, 6, 6]
else:
raise NotImplementedError
def __call__(self, features):
kwargs = dict(strides=2, activation=self._act)
x = self.get('h1', tfkl.Dense, self._outdim, None)(features)
x = tf.reshape(x, [-1, 1, 1, self._outdim])
x = self.get('h2', tfkl.Conv2DTranspose,
4 * self._depth, self._kernel_sizes[0], **kwargs)(x)
x = self.get('h3', tfkl.Conv2DTranspose,
2 * self._depth, self._kernel_sizes[1], **kwargs)(x)
x = self.get('h4', tfkl.Conv2DTranspose,
1 * self._depth, self._kernel_sizes[2], **kwargs)(x)
x = self.get('h5', tfkl.Conv2DTranspose,
self._shape[-1], self._kernel_sizes[3], strides=2)(x)
mean = tf.reshape(x, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
return tfd.Independent(tfd.Normal(mean, 1), len(self._shape))
class ConvDecoderMask(tools.Module):
def __init__(self, depth=32, act=tf.nn.relu, shape=(64, 64, 3)):
self._act = act
self._depth = depth
self._shape = shape
if shape[0] == 64:
self._outdim = 32 * self._depth
self._kernel_sizes = [5, 5, 6, 6]
elif shape[0] == 32:
self._outdim = 8 * self._depth
self._kernel_sizes = [3, 3, 3, 4]
elif shape[0] == 84:
self._outdim = 72 * self._depth
self._kernel_sizes = [7, 6, 6, 6]
else:
raise NotImplementedError
def __call__(self, features):
kwargs = dict(strides=2, activation=self._act)
x = self.get('h1', tfkl.Dense, self._outdim, None)(features)
x = tf.reshape(x, [-1, 1, 1, self._outdim])
x = self.get('h2', tfkl.Conv2DTranspose,
4 * self._depth, self._kernel_sizes[0], **kwargs)(x)
x = self.get('h3', tfkl.Conv2DTranspose,
2 * self._depth, self._kernel_sizes[1], **kwargs)(x)
x = self.get('h4', tfkl.Conv2DTranspose,
1 * self._depth, self._kernel_sizes[2], **kwargs)(x)
x = self.get('h5', tfkl.Conv2DTranspose,
3 + self._shape[-1], self._kernel_sizes[3], strides=2)(x)
mean, mask = tf.split(x, [3, 3], -1)
mean = tf.reshape(mean, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
mask = tf.reshape(mask, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), mask
class ConvDecoderMaskEnsemble(tools.Module):
"""
ensemble two convdecoder with <Normal, mask> outputs
"""
def __init__(self, decoder1, decoder2, precision):
self._decoder1 = decoder1
self._decoder2 = decoder2
self._precision = 'float' + str(precision)
self._shape = decoder1._shape
def __call__(self, feat1, feat2):
kwargs = dict(strides=1, activation=tf.nn.sigmoid)
pred1, mask1 = self._decoder1(feat1)
pred2, mask2 = self._decoder2(feat2)
mean1 = pred1.submodules[0].loc
mean2 = pred2.submodules[0].loc
mask_feat = tf.concat([mask1, mask2], -1)
mask = self.get('mask1', tfkl.Conv2D, 1, 1, **kwargs)(mask_feat)
mask_use1 = mask
mask_use2 = 1-mask
mean = mean1 * tf.cast(mask_use1, self._precision) + \
mean2 * tf.cast(mask_use2, self._precision)
return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), pred1, pred2, tf.cast(mask_use1, self._precision)
class InverseDecoder(tools.Module):
def __init__(self, shape, layers, units, act=tf.nn.elu):
self._shape = shape
self._layers = layers
self._units = units
self._act = act
def __call__(self, features):
x = tf.concat([features[:, :-1], features[:, 1:]], -1)
for index in range(self._layers):
x = self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x)
x = self.get(f'hout', tfkl.Dense, np.prod(self._shape))(x)
return tfd.Independent(tfd.Normal(x, 1), 1)
class DenseDecoder(tools.Module):
def __init__(self, shape, layers, units, dist='normal', act=tf.nn.elu):
self._shape = shape
self._layers = layers
self._units = units
self._dist = dist
self._act = act
def __call__(self, features):
x = features
for index in range(self._layers):
x = self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x)
x = self.get(f'hout', tfkl.Dense, np.prod(self._shape))(x)
x = tf.reshape(x, tf.concat([tf.shape(features)[:-1], self._shape], 0))
if self._dist == 'normal':
return tfd.Independent(tfd.Normal(x, 1), len(self._shape))
if self._dist == 'binary':
return tfd.Independent(tfd.Bernoulli(x), len(self._shape))
raise NotImplementedError(self._dist)
class ActionDecoder(tools.Module):
def __init__(
self, size, layers, units, dist='tanh_normal', act=tf.nn.elu,
min_std=1e-4, init_std=5, mean_scale=5):
self._size = size
self._layers = layers
self._units = units
self._dist = dist
self._act = act
self._min_std = min_std
self._init_std = init_std
self._mean_scale = mean_scale
def __call__(self, features):
raw_init_std = np.log(np.exp(self._init_std) - 1)
x = features
for index in range(self._layers):
x = self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x)
if self._dist == 'tanh_normal':
# https://www.desmos.com/calculator/rcmcf5jwe7
x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x)
mean, std = tf.split(x, 2, -1)
mean = self._mean_scale * tf.tanh(mean / self._mean_scale)
std = tf.nn.softplus(std + raw_init_std) + self._min_std
dist = tfd.Normal(mean, std)
dist = tfd.TransformedDistribution(dist, tools.TanhBijector())
dist = tfd.Independent(dist, 1)
dist = tools.SampleDist(dist)
elif self._dist == 'onehot':
x = self.get(f'hout', tfkl.Dense, self._size)(x)
dist = tools.OneHotDist(x)
else:
raise NotImplementedError(self._dist)
return dist
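Editor's note: an editorial sketch with hypothetical batch and feature sizes, not part of the original file. `ConvDecoderMaskEnsemble` decodes two feature streams into per-pixel means and mask logits, fuses the two mask maps through a 1x1 sigmoid convolution, and blends the means pixel-wise.
import tensorflow as tf

task_decoder = ConvDecoderMask(depth=32, shape=(64, 64, 3))
distractor_decoder = ConvDecoderMask(depth=32, shape=(64, 64, 3))
joint_decoder = ConvDecoderMaskEnsemble(task_decoder, distractor_decoder, precision=32)

feat_task = tf.zeros([8, 230])        # hypothetical: stoch (30) + deter (200) features
feat_distractor = tf.zeros([8, 230])
image_pred, pred1, pred2, mask = joint_decoder(feat_task, feat_distractor)
# image_pred.mean() equals mask * pred1.mean() + (1 - mask) * pred2.mean(), pixel-wise.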

121
Dreamer/run.py Normal file
View File

@ -0,0 +1,121 @@
import wrappers
import tools
import tensorflow as tf
import argparse
import functools
import yaml
import os
import pathlib
import sys
from tensorflow.keras.mixed_precision import experimental as prec
from dreamers import Dreamer, SeparationDreamer, InverseDreamer
from env_tools import count_steps, make_env
METHOD2DREAMER = {
'dreamer': Dreamer,
'tia': SeparationDreamer,
'inverse': InverseDreamer
}
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['MUJOCO_GL'] = 'egl'
tf.get_logger().setLevel('ERROR')
sys.path.append(str(pathlib.Path(__file__).parent))
def main(method, config):
if method == 'tia':
config.logdir = os.path.join(
config.logdir, config.task,
'separation' + '_' + str(config.disen_neg_rew_scale) +
'_' + str(config.disen_rec_scale),
str(config.seed))
else:
config.logdir = os.path.join(
config.logdir, config.task,
method,
str(config.seed))
logdir = pathlib.Path(config.logdir)
logdir.mkdir(parents=True, exist_ok=True)
snapshot_dir = os.path.join(config.logdir, 'snapshots')
snapshot_dir = pathlib.Path(snapshot_dir)
snapshot_dir.mkdir(parents=True, exist_ok=True)
with open(os.path.join(config.logdir, 'config.yaml'), 'w') as f:
yaml.dump(vars(config), f, sort_keys=False)
if config.gpu_growth:
for gpu in tf.config.experimental.list_physical_devices('GPU'):
tf.config.experimental.set_memory_growth(gpu, True)
assert config.precision in (16, 32), config.precision
if config.precision == 16:
prec.set_policy(prec.Policy('mixed_float16'))
config.steps = int(config.steps)
config.logdir = logdir
print('Logdir', config.logdir)
# Create environments.
datadir = config.logdir / 'episodes'
writer = tf.summary.create_file_writer(
str(config.logdir), max_queue=1000, flush_millis=20000)
writer.set_as_default()
train_envs = [wrappers.Async(lambda: make_env(
config, writer, 'train', datadir, config.video_dir, store=True), config.parallel)
for _ in range(config.envs)]
test_envs = [wrappers.Async(lambda: make_env(
config, writer, 'test', datadir, config.video_dir, store=False), config.parallel)
for _ in range(config.envs)]
actspace = train_envs[0].action_space
# Prefill dataset with random episodes.
step = count_steps(datadir, config)
prefill = max(0, config.prefill - step)
print(f'Prefill dataset with {prefill} steps.')
def random_agent(o, d, _): return ([actspace.sample() for _ in d], None)
tools.simulate(random_agent, train_envs, prefill / config.action_repeat)
writer.flush()
# Train and regularly evaluate the agent.
step = count_steps(datadir, config)
print(f'Simulating agent for {config.steps-step} steps.')
DreamerModel = METHOD2DREAMER[method]
agent = DreamerModel(config, datadir, actspace, writer)
if (config.logdir / 'variables.pkl').exists():
print('Load checkpoint.')
agent.load(config.logdir / 'variables.pkl')
state = None
should_snapshot = tools.Every(config.snapshot_every)
while step < config.steps:
print('Start evaluation.')
tools.simulate(
functools.partial(agent, training=False), test_envs, episodes=1)
writer.flush()
print('Start collection.')
steps = config.eval_every // config.action_repeat
state = tools.simulate(agent, train_envs, steps, state=state)
step = count_steps(datadir, config)
agent.save(config.logdir / 'variables.pkl')
if should_snapshot(step):
agent.save(snapshot_dir / ('variables_' + str(step) + '.pkl'))
for env in train_envs + test_envs:
env.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--method', type=str, choices=['dreamer', 'inverse', 'tia'], required=True)
parser.add_argument('--configs', nargs='+', required=True)
args, remaining = parser.parse_known_args()
config_path = 'train_configs/' + args.method + '.yaml'
configs = yaml.safe_load(
(pathlib.Path(__file__).parent / config_path).read_text())
config_ = {}
for name in args.configs:
config_.update(configs[name])
parser = argparse.ArgumentParser()
for key, value in config_.items():
parser.add_argument(
f'--{key}', type=tools.args_type(value), default=value)
main(args.method, parser.parse_args(remaining))
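Editor's note (editorial usage remark, not part of the original file): the argument parsing above is two-staged. The first parser picks the method and a list of config section names, the matching `train_configs/<method>.yaml` is loaded, the named sections (for example `dmc`, optionally overlaid with `debug`) are merged into a flat dict, and a second parser turns every key into a command-line flag. Under those assumptions a training run might be launched with `python Dreamer/run.py --method tia --configs dmc`, with any individual setting overridable as an extra flag such as `--seed 1`.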

467
Dreamer/tools.py Normal file
View File

@ -0,0 +1,467 @@
import datetime
import io
import pathlib
import pickle
import re
import uuid
import gym
import numpy as np
import tensorflow as tf
import tensorflow.compat.v1 as tf1 # pylint: disable=E
import tensorflow_probability as tfp
from tensorflow.keras.mixed_precision import experimental as prec
from tensorflow_probability import distributions as tfd
class AttrDict(dict):
__setattr__ = dict.__setitem__
__getattr__ = dict.__getitem__
def from_dict(self, src_dict):
for key in src_dict:
setattr(self, key, src_dict[key])
class Module(tf.Module):
def save(self, filename):
values = tf.nest.map_structure(lambda x: x.numpy(), self.variables)
with pathlib.Path(filename).open('wb') as f:
pickle.dump(values, f)
def load(self, filename):
with pathlib.Path(filename).open('rb') as f:
values = pickle.load(f)
tf.nest.map_structure(lambda x, y: x.assign(y), self.variables, values)
def get(self, name, ctor, *args, **kwargs):
# Create or get layer by name to avoid mentioning it in the constructor.
if not hasattr(self, '_modules'):
self._modules = {}
if name not in self._modules:
self._modules[name] = ctor(*args, **kwargs)
return self._modules[name]
def nest_summary(structure):
if isinstance(structure, dict):
return {k: nest_summary(v) for k, v in structure.items()}
if isinstance(structure, list):
return [nest_summary(v) for v in structure]
if hasattr(structure, 'shape'):
return str(structure.shape).replace(', ', 'x').strip('(), ')
return '?'
def graph_summary(writer, fn, step, *args):
def inner(*args):
tf.summary.experimental.set_step(step)
with writer.as_default():
fn(*args)
return tf.numpy_function(inner, args, [])
def video_summary(name, video, step=None, fps=20):
if isinstance(name, type(np.zeros(1))):
name = str(name)
else:
name = name if isinstance(name, str) else name.decode('utf-8')
if np.issubdtype(video.dtype, np.floating):
video = np.clip(255 * video, 0, 255).astype(np.uint8)
B, T, H, W, C = video.shape
try:
frames = video.transpose((1, 2, 0, 3, 4)).reshape((T, H, B * W, C))
summary = tf1.Summary()
image = tf1.Summary.Image(height=B * H, width=T * W, colorspace=C)
image.encoded_image_string = encode_gif(frames, fps)
summary.value.add(tag=name + '/gif', image=image)
tf.summary.experimental.write_raw_pb(summary.SerializeToString(), step)
except (IOError, OSError) as e:
print('GIF summaries require ffmpeg in $PATH.', e)
frames = video.transpose((0, 2, 1, 3, 4)).reshape((1, B * H, T * W, C))
tf.summary.image(name + '/grid', frames, step)
def encode_gif(frames, fps):
from subprocess import Popen, PIPE
h, w, c = frames[0].shape
pxfmt = {1: 'gray', 3: 'rgb24'}[c]
cmd = ' '.join([
f'ffmpeg -y -f rawvideo -vcodec rawvideo',
f'-r {fps:.02f} -s {w}x{h} -pix_fmt {pxfmt} -i - -filter_complex',
f'[0:v]split[x][z];[z]palettegen[y];[x]fifo[x];[x][y]paletteuse',
f'-r {fps:.02f} -f gif -'])
proc = Popen(cmd.split(' '), stdin=PIPE, stdout=PIPE, stderr=PIPE)
for image in frames:
proc.stdin.write(image.tostring())
out, err = proc.communicate()
if proc.returncode:
raise IOError('\n'.join([' '.join(cmd), err.decode('utf8')]))
del proc
return out
def simulate(agent, envs, steps=0, episodes=0, state=None):
# Initialize or unpack simulation state.
if state is None:
step, episode = 0, 0
done = np.ones(len(envs), np.bool)
length = np.zeros(len(envs), np.int32)
obs = [None] * len(envs)
agent_state = None
else:
step, episode, done, length, obs, agent_state = state
while (steps and step < steps) or (episodes and episode < episodes):
# Reset envs if necessary.
if done.any():
indices = [index for index, d in enumerate(done) if d]
promises = [envs[i].reset(blocking=False) for i in indices]
for index, promise in zip(indices, promises):
obs[index] = promise()
# Step agents.
# If using augmentation, dreamer.policy (or this step) needs to be modified.
obs = {k: np.stack([o[k] for o in obs]) for k in obs[0]}
action, agent_state = agent(obs, done, agent_state)
action = np.array(action)
assert len(action) == len(envs)
# Step envs.
promises = [e.step(a, blocking=False) for e, a in zip(envs, action)]
obs, _, done = zip(*[p()[:3] for p in promises])
obs = list(obs)
done = np.stack(done)
episode += int(done.sum())
length += 1
step += (done * length).sum()
length *= (1 - done)
# Return new state to allow resuming the simulation.
return (step - steps, episode - episodes, done, length, obs, agent_state)
def count_episodes(directory):
filenames = directory.glob('*.npz')
lengths = [int(n.stem.rsplit('-', 1)[-1]) - 1 for n in filenames]
episodes, steps = len(lengths), sum(lengths)
return episodes, steps
def save_episodes(directory, episodes):
directory = pathlib.Path(directory).expanduser()
directory.mkdir(parents=True, exist_ok=True)
timestamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
for episode in episodes:
identifier = str(uuid.uuid4().hex)
length = len(episode['reward'])
filename = directory / f'{timestamp}-{identifier}-{length}.npz'
with io.BytesIO() as f1:
np.savez_compressed(f1, **episode)
f1.seek(0)
with filename.open('wb') as f2:
f2.write(f1.read())
def load_episodes(directory, rescan, length=None, balance=False, seed=0):
directory = pathlib.Path(directory).expanduser()
random = np.random.RandomState(seed)
cache = {}
while True:
for filename in directory.glob('*.npz'):
if filename not in cache:
try:
with filename.open('rb') as f:
episode = np.load(f)
episode = {k: episode[k] for k in episode.keys()}
except Exception as e:
print(f'Could not load episode: {e}')
continue
cache[filename] = episode
keys = list(cache.keys())
for index in random.choice(len(keys), rescan):
episode = cache[keys[index]]
if length:
total = len(next(iter(episode.values())))
available = total - length
if available < 1:
print(f'Skipped short episode of length {total} (need more than {length} steps).')
continue
if balance:
index = min(random.randint(0, total), available)
else:
index = int(random.randint(0, available))
episode = {k: v[index: index + length]
for k, v in episode.items()}
yield episode
class DummyEnv:
def __init__(self):
self._random = np.random.RandomState(seed=0)
self._step = None
@property
def observation_space(self):
low = np.zeros([64, 64, 3], dtype=np.uint8)
high = 255 * np.ones([64, 64, 3], dtype=np.uint8)
spaces = {'image': gym.spaces.Box(low, high)}
return gym.spaces.Dict(spaces)
@property
def action_space(self):
low = -np.ones([5], dtype=np.float32)
high = np.ones([5], dtype=np.float32)
return gym.spaces.Box(low, high)
def reset(self):
self._step = 0
obs = self.observation_space.sample()
return obs
def step(self, action):
obs = self.observation_space.sample()
reward = self._random.uniform(0, 1)
self._step += 1
done = self._step >= 1000
info = {}
return obs, reward, done, info
class SampleDist:
def __init__(self, dist, samples=100):
self._dist = dist
self._samples = samples
@property
def name(self):
return 'SampleDist'
def __getattr__(self, name):
return getattr(self._dist, name)
def mean(self):
samples = self._dist.sample(self._samples)
return tf.reduce_mean(samples, 0)
def mode(self):
sample = self._dist.sample(self._samples)
logprob = self._dist.log_prob(sample)
return tf.gather(sample, tf.argmax(logprob))[0] # pylint: disable=E
def entropy(self):
sample = self._dist.sample(self._samples)
logprob = self.log_prob(sample)
return -tf.reduce_mean(logprob, 0)
class OneHotDist:
def __init__(self, logits=None, probs=None):
self._dist = tfd.Categorical(logits=logits, probs=probs)
self._num_classes = self.mean().shape[-1]
self._dtype = prec.global_policy().compute_dtype
@property
def name(self):
return 'OneHotDist'
def __getattr__(self, name):
return getattr(self._dist, name)
def prob(self, events):
indices = tf.argmax(events, axis=-1)
return self._dist.prob(indices)
def log_prob(self, events):
indices = tf.argmax(events, axis=-1)
return self._dist.log_prob(indices)
def mean(self):
return self._dist.probs_parameter()
def mode(self):
return self._one_hot(self._dist.mode())
def sample(self, amount=None):
amount = [amount] if amount else []
indices = self._dist.sample(*amount)
sample = self._one_hot(indices)
probs = self._dist.probs_parameter()
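# Editorial note: adding (probs - stop_gradient(probs)) keeps the sampled one-hot
# value unchanged in the forward pass while routing gradients through the class
# probabilities, i.e. a straight-through gradient estimator.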
sample += tf.cast(probs - tf.stop_gradient(probs), self._dtype)
return sample
def _one_hot(self, indices):
return tf.one_hot(indices, self._num_classes, dtype=self._dtype) # pylint: disable=E
class TanhBijector(tfp.bijectors.Bijector):
def __init__(self, validate_args=False, name='tanh'):
super().__init__(
forward_min_event_ndims=0,
validate_args=validate_args,
name=name)
def _forward(self, x):
return tf.nn.tanh(x)
def _inverse(self, y):
dtype = y.dtype
y = tf.cast(y, tf.float32)
y = tf.where(
tf.less_equal(tf.abs(y), 1.),
tf.clip_by_value(y, -0.99999997, 0.99999997), y)
y = tf.atanh(y)
y = tf.cast(y, dtype)
return y
def _forward_log_det_jacobian(self, x):
log2 = tf.math.log(tf.constant(2.0, dtype=x.dtype))
return 2.0 * (log2 - x - tf.nn.softplus(-2.0 * x))
def lambda_return(
reward, value, pcont, bootstrap, lambda_, axis):
# Setting lambda=1 gives a discounted Monte Carlo return.
# Setting lambda=0 gives a fixed 1-step return.
assert reward.shape.ndims == value.shape.ndims, (reward.shape, value.shape)
if isinstance(pcont, (int, float)):
pcont = pcont * tf.ones_like(reward)
dims = list(range(reward.shape.ndims))
dims = [axis] + dims[1:axis] + [0] + dims[axis + 1:]
if axis != 0:
reward = tf.transpose(reward, dims)
value = tf.transpose(value, dims)
pcont = tf.transpose(pcont, dims)
if bootstrap is None:
bootstrap = tf.zeros_like(value[-1])
next_values = tf.concat([value[1:], bootstrap[None]], 0)
inputs = reward + pcont * next_values * (1 - lambda_)
returns = static_scan(
lambda agg, cur: cur[0] + cur[1] * lambda_ * agg,
(inputs, pcont), bootstrap, reverse=True)
if axis != 0:
returns = tf.transpose(returns, dims)
return returns
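# Editorial note (not in the original file): the backward scan above implements the
# TD(lambda) recursion
#   R_t = r_t + pcont_t * ((1 - lambda_) * V(s_{t+1}) + lambda_ * R_{t+1}),
# seeded with the bootstrap value; lambda_ = 0 recovers the 1-step return and
# lambda_ = 1 the discounted Monte Carlo return, as the comment above notes.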
class Adam(tf.Module):
def __init__(self, name, modules, lr, clip=None, wd=None, wdpattern=r'.*'):
self._name = name
self._modules = modules
self._clip = clip
self._wd = wd
self._wdpattern = wdpattern
self._opt = tf.optimizers.Adam(lr)
self._opt = prec.LossScaleOptimizer(self._opt, 'dynamic')
self._variables = None
@property
def variables(self):
return self._opt.variables()
def __call__(self, tape, loss):
if self._variables is None:
variables = [module.variables for module in self._modules]
self._variables = tf.nest.flatten(variables)
count = sum(np.prod(x.shape) for x in self._variables)
print(f'Found {count} {self._name} parameters.')
assert len(loss.shape) == 0, loss.shape
with tape:
loss = self._opt.get_scaled_loss(loss)
grads = tape.gradient(loss, self._variables)
grads = self._opt.get_unscaled_gradients(grads)
norm = tf.linalg.global_norm(grads)
if self._clip:
grads, _ = tf.clip_by_global_norm(grads, self._clip, norm)
if self._wd:
context = tf.distribute.get_replica_context()
context.merge_call(self._apply_weight_decay)
self._opt.apply_gradients(zip(grads, self._variables))
return norm
def _apply_weight_decay(self, strategy):
print('Applied weight decay to variables:')
for var in self._variables:
if re.search(self._wdpattern, self._name + '/' + var.name):
print('- ' + self._name + '/' + var.name)
strategy.extended.update(var, lambda var: self._wd * var)
def args_type(default):
if isinstance(default, bool):
return lambda x: bool(['False', 'True'].index(x))
if isinstance(default, int):
return lambda x: float(x) if ('e' in x or '.' in x) else int(x)
if isinstance(default, pathlib.Path):
return lambda x: pathlib.Path(x).expanduser()
return type(default)
def static_scan(fn, inputs, start, reverse=False):
last = start
outputs = [[] for _ in tf.nest.flatten(start)]
indices = range(len(tf.nest.flatten(inputs)[0]))
if reverse:
indices = reversed(indices)
for index in indices:
inp = tf.nest.map_structure(lambda x: x[index], inputs)
last = fn(last, inp)
[o.append(l) for o, l in zip(outputs, tf.nest.flatten(last))]
if reverse:
outputs = [list(reversed(x)) for x in outputs]
outputs = [tf.stack(x, 0) for x in outputs]
return tf.nest.pack_sequence_as(start, outputs)
def _mnd_sample(self, sample_shape=(), seed=None, name='sample'):
return tf.random.normal(
tuple(sample_shape) + tuple(self.event_shape),
self.mean(), self.stddev(), self.dtype, seed, name)
tfd.MultivariateNormalDiag.sample = _mnd_sample
def _cat_sample(self, sample_shape=(), seed=None, name='sample'):
assert len(sample_shape) in (0, 1), sample_shape
assert len(self.logits_parameter().shape) == 2
indices = tf.random.categorical(
self.logits_parameter(), sample_shape[0] if sample_shape else 1,
self.dtype, seed, name)
if not sample_shape:
indices = indices[..., 0]
return indices
tfd.Categorical.sample = _cat_sample
class Every:
def __init__(self, every):
self._every = every
self._last = None
def __call__(self, step):
if self._last is None:
self._last = step
return True
if step >= self._last + self._every:
self._last += self._every
return True
return False
class Once:
def __init__(self):
self._once = True
def __call__(self):
if self._once:
self._once = False
return True
return False

View File

@ -0,0 +1,73 @@
dmc:
logdir: ./
video_dir: ./
debug: False
seed: 0
steps: 1000000.0
snapshot_every: 20000.0
eval_every: 5000.0
log_every: 5000.0
image_size: 64
log_scalars: true
log_images: true
gpu_growth: true
precision: 16
task: dmc_cheetah_run_driving
envs: 1
parallel: none
action_repeat: 2
time_limit: 1000
prefill: 5000
eval_noise: 0.0
clip_rewards: none
deter_size: 283
stoch_size: 42
num_units: 400
dense_act: elu
cnn_act: relu
cnn_depth: 45
pcont: false
free_nats: 3.0
kl_scale: 1.0
pcont_scale: 10.0
weight_decay: 0.0
weight_decay_pattern: .*
batch_size: 50
batch_length: 50
train_every: 1000
train_steps: 100
pretrain: 100
model_lr: 0.0006
value_lr: 8.0e-05
actor_lr: 8.0e-05
grad_clip: 100.0
dataset_balance: false
discount: 0.99
disclam: 0.95
horizon: 15
action_dist: tanh_normal
action_init_std: 5.0
expl: additive_gaussian
expl_amount: 0.3
expl_decay: 0.0
expl_min: 0.0
debug:
debug: True
pretrain: 1
prefill: 1
train_steps: 1
deter_size: 20
stoch_size: 3
num_units: 40
cnn_depth: 3
batch_size: 10
batch_length: 20

View File

@ -0,0 +1,66 @@
dmc:
logdir: ./
video_dir: ./
debug: False
seed: 0
steps: 1000000.0
snapshot_every: 20000.0
eval_every: 5000.0
log_every: 5000.0
image_size: 64
log_scalars: true
log_images: false
gpu_growth: true
precision: 16
task: dmc_cheetah_run_driving
envs: 1
parallel: none
action_repeat: 2
time_limit: 1000
prefill: 5000
eval_noise: 0.0
clip_rewards: none
deter_size: 283
stoch_size: 42
num_units: 400
dense_act: elu
cnn_act: relu
cnn_depth: 45
pcont: false
free_nats: 3.0
kl_scale: 1.0
pcont_scale: 10.0
weight_decay: 0.0
weight_decay_pattern: .*
batch_size: 50
batch_length: 50
train_every: 1000
train_steps: 100
pretrain: 100
model_lr: 0.0006
value_lr: 8.0e-05
actor_lr: 8.0e-05
grad_clip: 100.0
dataset_balance: false
discount: 0.99
disclam: 0.95
horizon: 15
action_dist: tanh_normal
action_init_std: 5.0
expl: additive_gaussian
expl_amount: 0.3
expl_decay: 0.0
expl_min: 0.0
debug:
debug: True
pretrain: 1
prefill: 1
train_steps: 1
batch_size: 10
batch_length: 20

View File

@ -0,0 +1,75 @@
dmc:
logdir: ./
video_dir: ./
debug: False
seed: 0
steps: 1000000.0
snapshot_every: 20000.0
eval_every: 5000.0
log_every: 5000.0
image_size: 64
log_scalars: true
log_images: true
gpu_growth: true
precision: 16
task: dmc_cheetah_run_driving
envs: 1
parallel: none
action_repeat: 2
time_limit: 1000
prefill: 5000
eval_noise: 0.0
clip_rewards: none
deter_size: 200
stoch_size: 30
disen_deter_size: 200
disen_stoch_size: 30
num_units: 400
dense_act: elu
cnn_act: relu
cnn_depth: 26
disen_cnn_depth: 26
pcont: false
free_nats: 3.0
num_reward_opt_iters: 20
disen_neg_rew_scale: 20000.0
disen_rec_scale: 1.5
disen_kl_scale: 1.0
kl_scale: 1.0
reward_scale: 1.0
pcont_scale: 10.0
weight_decay: 0.0
weight_decay_pattern: .*
batch_size: 50
batch_length: 50
train_every: 1000
train_steps: 100
pretrain: 100
disen_reward_lr: 0.0006
model_lr: 0.0006
value_lr: 8.0e-05
actor_lr: 8.0e-05
grad_clip: 100.0
dataset_balance: false
discount: 0.99
disclam: 0.95
horizon: 15
action_dist: tanh_normal
action_init_std: 5.0
expl: additive_gaussian
expl_amount: 0.3
expl_decay: 0.0
expl_min: 0.0
debug:
debug: True
pretrain: 1
prefill: 1
train_steps: 1
batch_size: 10
batch_length: 20

540
Dreamer/wrappers.py Normal file
View File

@ -0,0 +1,540 @@
import atexit
import functools
import sys
import threading
import traceback
import gym
import numpy as np
from PIL import Image
from collections import deque
from numpy.core import overrides
class DMC2GYMWrapper:
def __init__(self, env):
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
spaces = {}
spaces['image'] = gym.spaces.Box(
0, 255, (self._env._height, self._env._width, 3,), dtype=np.uint8)
return gym.spaces.Dict(spaces)
def step(self, action):
image, reward, done, info = self._env.step(action)
obs = {'image': image}
return obs, reward, done, info
def reset(self):
image = self._env.reset()
obs = {'image': image}
return obs
class DeepMindControl:
def __init__(self, name, size=(64, 64), camera=None):
domain, task = name.split('_', 1)
if domain == 'cup': # Only domain with multiple words.
domain = 'ball_in_cup'
if isinstance(domain, str):
from dm_control import suite
self._env = suite.load(domain, task)
else:
assert task is None
self._env = domain()
self._size = size
if camera is None:
camera = dict(quadruped=2).get(domain, 0)
self._camera = camera
@property
def observation_space(self):
spaces = {}
for key, value in self._env.observation_spec().items():
spaces[key] = gym.spaces.Box(
-np.inf, np.inf, value.shape, dtype=np.float32)
spaces['image'] = gym.spaces.Box(
0, 255, self._size + (3,), dtype=np.uint8)
return gym.spaces.Dict(spaces)
@property
def action_space(self):
spec = self._env.action_spec()
return gym.spaces.Box(spec.minimum, spec.maximum, dtype=np.float32)
def step(self, action):
time_step = self._env.step(action)
obs = dict(time_step.observation)
obs['image'] = self.render()
reward = time_step.reward or 0
done = time_step.last()
info = {'discount': np.array(time_step.discount, np.float32)}
return obs, reward, done, info
def reset(self):
time_step = self._env.reset()
obs = dict(time_step.observation)
obs['image'] = self.render()
return obs
def render(self, *args, **kwargs):
if kwargs.get('mode', 'rgb_array') != 'rgb_array':
raise ValueError("Only render mode 'rgb_array' is supported.")
return self._env.physics.render(*self._size, camera_id=self._camera)
class Atari:
LOCK = threading.Lock()
def __init__(
self, name, action_repeat=4, size=(84, 84), grayscale=True, noops=30,
life_done=False, sticky_actions=True):
import gym
version = 0 if sticky_actions else 4
name = ''.join(word.title() for word in name.split('_'))
with self.LOCK:
self._env = gym.make('{}NoFrameskip-v{}'.format(name, version))
self._action_repeat = action_repeat
self._size = size
self._grayscale = grayscale
self._noops = noops
self._life_done = life_done
self._lives = None
shape = self._env.observation_space.shape[:2] + \
(() if grayscale else (3,))
self._buffers = [np.empty(shape, dtype=np.uint8) for _ in range(2)]
self._random = np.random.RandomState(seed=None)
@property
def observation_space(self):
shape = self._size + (1 if self._grayscale else 3,)
space = gym.spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8)
return gym.spaces.Dict({'image': space})
@property
def action_space(self):
return self._env.action_space
def close(self):
return self._env.close()
def reset(self):
with self.LOCK:
self._env.reset()
noops = self._random.randint(1, self._noops + 1)
for _ in range(noops):
done = self._env.step(0)[2]
if done:
with self.LOCK:
self._env.reset()
self._lives = self._env.ale.lives()
if self._grayscale:
self._env.ale.getScreenGrayscale(self._buffers[0])
else:
self._env.ale.getScreenRGB2(self._buffers[0])
self._buffers[1].fill(0)
return self._get_obs()
def step(self, action):
total_reward = 0.0
for step in range(self._action_repeat):
_, reward, done, info = self._env.step(action)
total_reward += reward
if self._life_done:
lives = self._env.ale.lives()
done = done or lives < self._lives
self._lives = lives
if done:
break
elif step >= self._action_repeat - 2:
index = step - (self._action_repeat - 2)
if self._grayscale:
self._env.ale.getScreenGrayscale(self._buffers[index])
else:
self._env.ale.getScreenRGB2(self._buffers[index])
obs = self._get_obs()
return obs, total_reward, done, info
def render(self, mode):
return self._env.render(mode)
def _get_obs(self):
if self._action_repeat > 1:
np.maximum(self._buffers[0],
self._buffers[1], out=self._buffers[0])
image = np.array(Image.fromarray(self._buffers[0]).resize(
self._size, Image.BILINEAR))
image = np.clip(image, 0, 255).astype(np.uint8)
image = image[:, :, None] if self._grayscale else image
return {'image': image}
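# Descriptive note (added): the element-wise np.maximum over the last two raw
# frames is the usual Atari max-pooling that removes sprite flicker; the result is
# resized to `size` with bilinear interpolation and returned as a uint8 image dict.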
class Collect:
def __init__(self, env, callbacks=None, precision=32):
self._env = env
self._callbacks = callbacks or ()
self._precision = precision
self._episode = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
obs, reward, done, info = self._env.step(action)
obs = {k: self._convert(v) for k, v in obs.items()}
transition = obs.copy()
transition['action'] = action
transition['reward'] = reward
transition['discount'] = info.get(
'discount', np.array(1 - float(done)))
self._episode.append(transition)
if done:
episode = {k: [t[k] for t in self._episode]
for k in self._episode[0]}
episode = {k: self._convert(v) for k, v in episode.items()}
info['episode'] = episode
for callback in self._callbacks:
callback(episode)
return obs, reward, done, info
def reset(self):
obs = self._env.reset()
transition = obs.copy()
transition['action'] = np.zeros(self._env.action_space.shape)
transition['reward'] = 0.0
transition['discount'] = 1.0
self._episode = [transition]
return obs
def _convert(self, value):
value = np.array(value)
if np.issubdtype(value.dtype, np.floating):
dtype = {16: np.float16, 32: np.float32,
64: np.float64}[self._precision]
elif np.issubdtype(value.dtype, np.signedinteger):
dtype = {16: np.int16, 32: np.int32, 64: np.int64}[self._precision]
elif np.issubdtype(value.dtype, np.uint8):
dtype = np.uint8
else:
raise NotImplementedError(value.dtype)
return value.astype(dtype)
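# Descriptive note (added): Collect buffers one transition per step, starting from
# a dummy reset transition (zero action, reward 0.0, discount 1.0). When the episode
# ends, the buffered transitions are transposed into a dict of arrays
# {key: [T+1, ...]} and handed to each callback, which is how episode writers such
# as process_episode receive them.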
class TimeLimit:
def __init__(self, env, duration):
self._env = env
self._duration = duration
self._step = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
assert self._step is not None, 'Must reset environment.'
obs, reward, done, info = self._env.step(action)
self._step += 1
if self._step >= self._duration:
done = True
if 'discount' not in info:
info['discount'] = np.array(1.0).astype(np.float32)
self._step = None
return obs, reward, done, info
def reset(self):
self._step = 0
return self._env.reset()
class ActionRepeat:
def __init__(self, env, amount):
self._env = env
self._amount = amount
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
done = False
total_reward = 0
current_step = 0
while current_step < self._amount and not done:
obs, reward, done, info = self._env.step(action)
total_reward += reward
current_step += 1
return obs, total_reward, done, info
class NormalizeActions:
def __init__(self, env):
self._env = env
self._mask = np.logical_and(
np.isfinite(env.action_space.low),
np.isfinite(env.action_space.high))
self._low = np.where(self._mask, env.action_space.low, -1)
self._high = np.where(self._mask, env.action_space.high, 1)
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
low = np.where(self._mask, -np.ones_like(self._low), self._low)
high = np.where(self._mask, np.ones_like(self._low), self._high)
return gym.spaces.Box(low, high, dtype=np.float32)
def step(self, action):
original = (action + 1) / 2 * (self._high - self._low) + self._low
original = np.where(self._mask, original, action)
return self._env.step(original)
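# Worked example (added for clarity): for a bounded dimension with low=0 and
# high=10, a policy action of 0.5 in [-1, 1] maps to (0.5 + 1) / 2 * 10 + 0 = 7.5;
# unbounded (non-finite) dimensions are passed through unchanged.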
class ObsDict:
def __init__(self, env, key='obs'):
self._env = env
self._key = key
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
spaces = {self._key: self._env.observation_space}
return gym.spaces.Dict(spaces)
@property
def action_space(self):
return self._env.action_space
def step(self, action):
obs, reward, done, info = self._env.step(action)
obs = {self._key: np.array(obs)}
return obs, reward, done, info
def reset(self):
obs = self._env.reset()
obs = {self._key: np.array(obs)}
return obs
class OneHotAction:
def __init__(self, env):
assert isinstance(env.action_space, gym.spaces.Discrete)
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
shape = (self._env.action_space.n,)
space = gym.spaces.Box(low=0, high=1, shape=shape, dtype=np.float32)
space.sample = self._sample_action
return space
def step(self, action):
index = np.argmax(action).astype(int)
reference = np.zeros_like(action)
reference[index] = 1
if not np.allclose(reference, action):
raise ValueError(f'Invalid one-hot action:\n{action}')
return self._env.step(index)
def reset(self):
return self._env.reset()
def _sample_action(self):
actions = self._env.action_space.n
index = self._random.randint(0, actions)
reference = np.zeros(actions, dtype=np.float32)
reference[index] = 1.0
return reference
class RewardObs:
def __init__(self, env):
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
spaces = self._env.observation_space.spaces
assert 'reward' not in spaces
spaces['reward'] = gym.spaces.Box(-np.inf, np.inf, dtype=np.float32)
return gym.spaces.Dict(spaces)
def step(self, action):
obs, reward, done, info = self._env.step(action)
obs['reward'] = reward
return obs, reward, done, info
def reset(self):
obs = self._env.reset()
obs['reward'] = 0.0
return obs
class Async:
_ACCESS = 1
_CALL = 2
_RESULT = 3
_EXCEPTION = 4
_CLOSE = 5
def __init__(self, ctor, strategy='process'):
self._strategy = strategy
if strategy == 'none':
self._env = ctor()
elif strategy == 'thread':
import multiprocessing.dummy as mp
elif strategy == 'process':
import multiprocessing as mp
else:
raise NotImplementedError(strategy)
if strategy != 'none':
self._conn, conn = mp.Pipe()
self._process = mp.Process(target=self._worker, args=(ctor, conn))
atexit.register(self.close)
self._process.start()
self._obs_space = None
self._action_space = None
@property
def observation_space(self):
if not self._obs_space:
self._obs_space = self.__getattr__('observation_space')
return self._obs_space
@property
def action_space(self):
if not self._action_space:
self._action_space = self.__getattr__('action_space')
return self._action_space
def __getattr__(self, name):
if self._strategy == 'none':
return getattr(self._env, name)
self._conn.send((self._ACCESS, name))
return self._receive()
def call(self, name, *args, **kwargs):
blocking = kwargs.pop('blocking', True)
if self._strategy == 'none':
return functools.partial(getattr(self._env, name), *args, **kwargs)
payload = name, args, kwargs
self._conn.send((self._CALL, payload))
promise = self._receive
return promise() if blocking else promise
def close(self):
if self._strategy == 'none':
try:
self._env.close()
except AttributeError:
pass
return
try:
self._conn.send((self._CLOSE, None))
self._conn.close()
except IOError:
# The connection was already closed.
pass
self._process.join()
def step(self, action, blocking=True):
return self.call('step', action, blocking=blocking)
def reset(self, blocking=True):
return self.call('reset', blocking=blocking)
def _receive(self):
try:
message, payload = self._conn.recv()
except ConnectionResetError:
raise RuntimeError('Environment worker crashed.')
# Re-raise exceptions in the main process.
if message == self._EXCEPTION:
stacktrace = payload
raise Exception(stacktrace)
if message == self._RESULT:
return payload
raise KeyError(f'Received message of unexpected type {message}')
def _worker(self, ctor, conn):
try:
env = ctor()
while True:
try:
# Only block for short periods so that KeyboardInterrupt can be raised promptly.
if not conn.poll(0.1):
continue
message, payload = conn.recv()
except (EOFError, KeyboardInterrupt):
break
if message == self._ACCESS:
name = payload
result = getattr(env, name)
conn.send((self._RESULT, result))
continue
if message == self._CALL:
name, args, kwargs = payload
result = getattr(env, name)(*args, **kwargs)
conn.send((self._RESULT, result))
continue
if message == self._CLOSE:
assert payload is None
break
raise KeyError(f'Received message of unknown type {message}')
except Exception:
stacktrace = ''.join(traceback.format_exception(*sys.exc_info()))
print(f'Error in environment process: {stacktrace}')
conn.send((self._EXCEPTION, stacktrace))
conn.close()
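# Descriptive note (added): Async talks to the worker process over a Pipe with a
# small message protocol: (_ACCESS, name) reads an attribute, (_CALL, (name, args,
# kwargs)) invokes a method, the worker replies with (_RESULT, value) or
# (_EXCEPTION, stacktrace), and (_CLOSE, None) shuts it down. With strategy='none'
# everything runs in-process and the pipe is never created.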
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
gym.Wrapper.__init__(self, env)
self._k = k
self._frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = gym.spaces.Box(
low=0,
high=1,
shape=((shp[0] * k,) + shp[1:]),
dtype=env.observation_space.dtype
)
self._max_episode_steps = env._max_episode_steps
def reset(self):
obs = self.env.reset()
for _ in range(self._k):
self._frames.append(obs)
return self._get_obs()
def step(self, action):
obs, reward, done, info = self.env.step(action)
self._frames.append(obs)
return self._get_obs(), reward, done, info
def _get_obs(self):
assert len(self._frames) == self._k
return np.concatenate(list(self._frames), axis=0)
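# Descriptive note (added): frames are concatenated along axis 0, so this wrapper
# assumes channel-first observations of shape (C, H, W) and produces stacks of
# shape (k * C, H, W), matching the observation_space defined above.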

185
DreamerV2/configs.yaml Normal file
View File

@ -0,0 +1,185 @@
defaults:
gpu: 'none'
logdir: ./
traindir: null
evaldir: null
offline_traindir: ''
offline_evaldir: ''
seed: 0
steps: 1e7
eval_every: 1e4
log_every: 1e4
reset_every: 0
gpu_growth: True
precision: 32
debug: False
expl_gifs: False
# Environment
task: 'dmc_walker_walk'
size: [64, 64]
envs: 1
action_repeat: 2
time_limit: 1000
prefill: 2500
eval_noise: 0.0
clip_rewards: 'identity'
atari_grayscale: False
# Model
dyn_cell: 'gru'
dyn_hidden: 200
dyn_deter: 200
dyn_stoch: 50
dyn_discrete: 0
dyn_input_layers: 1
dyn_output_layers: 1
dyn_shared: False
dyn_mean_act: 'none'
dyn_std_act: 'sigmoid2'
dyn_min_std: 0.1
grad_heads: ['image', 'reward']
units: 400
reward_layers: 2
discount_layers: 3
value_layers: 3
actor_layers: 4
act: 'elu'
cnn_depth: 32
encoder_kernels: [4, 4, 4, 4]
decoder_kernels: [5, 5, 6, 6]
decoder_thin: True
value_head: 'normal'
kl_scale: '1.0'
kl_balance: '0.8'
kl_free: '1.0'
pred_discount: False
discount_scale: 1.0
reward_scale: 1.0
weight_decay: 0.0
# Training
batch_size: 50
batch_length: 50
train_every: 5
train_steps: 1
pretrain: 100
model_lr: 3e-4
value_lr: 8e-5
actor_lr: 8e-5
opt_eps: 1e-5
grad_clip: 100
value_grad_clip: 100
actor_grad_clip: 100
dataset_size: 0
oversample_ends: False
slow_value_target: True
slow_actor_target: True
slow_target_update: 100
slow_target_fraction: 1
opt: 'adam'
# Behavior.
discount: 0.99
discount_lambda: 0.95
imag_horizon: 15
imag_gradient: 'dynamics'
imag_gradient_mix: '0.1'
imag_sample: True
actor_dist: 'trunc_normal'
actor_entropy: '1e-4'
actor_state_entropy: 0.0
actor_init_std: 1.0
actor_min_std: 0.1
actor_disc: 5
actor_temp: 0.1
actor_outscale: 0.0
expl_amount: 0.0
eval_state_mean: False
collect_dyn_sample: True
behavior_stop_grad: True
value_decay: 0.0
future_entropy: False
# Exploration
expl_behavior: 'greedy'
expl_until: 0
expl_extr_scale: 0.0
expl_intr_scale: 1.0
disag_target: 'stoch'
disag_log: True
disag_models: 10
disag_offset: 1
disag_layers: 4
disag_units: 400
atari:
# General
task: 'atari_demon_attack'
steps: 3e7
eval_every: 1e5
log_every: 1e4
prefill: 50000
dataset_size: 2e6
pretrain: 0
precision: 16
# Environment
time_limit: 108000 # 30 minutes of game play.
atari_grayscale: True
action_repeat: 4
eval_noise: 0.001
train_every: 16
train_steps: 1
clip_rewards: 'tanh'
# Model
grad_heads: ['image', 'reward', 'discount']
dyn_cell: 'gru_layer_norm'
pred_discount: True
cnn_depth: 48
dyn_deter: 600
dyn_hidden: 600
dyn_stoch: 32
dyn_discrete: 32
reward_layers: 4
discount_layers: 4
value_layers: 4
actor_layers: 4
# Behavior
actor_dist: 'onehot'
actor_entropy: 'linear(3e-3,3e-4,2.5e6)'
expl_amount: 0.0
expl_until: 3e7
discount: 0.995
imag_gradient: 'both'
imag_gradient_mix: 'linear(0.1,0,2.5e6)'
# Training
discount_scale: 5.0
reward_scale: 1
weight_decay: 1e-6
model_lr: 2e-4
kl_scale: 0.1
kl_free: 0.0
actor_lr: 4e-5
value_lr: 1e-4
oversample_ends: True
# Disen
disen_cnn_depth: 16
disen_only_scale: 1.0
disen_discount_scale: 2000.0
disen_reward_scale: 2000.0
num_reward_opt_iters: 20
debug:
debug: True
pretrain: 1
prefill: 1
train_steps: 1
batch_size: 10
batch_length: 20
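# Usage note (added for clarity; commands are illustrative): sections of this file
# are merged left-to-right by dreamer.py, so later --configs entries and explicit
# flags override earlier keys, e.g.:
#   python DreamerV2/dreamer.py --configs defaults atari --task atari_pong
#   python DreamerV2/dreamer.py --configs defaults debug --steps 1e5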

316
DreamerV2/dreamer.py Normal file
View File

@ -0,0 +1,316 @@
import argparse
import collections
import functools
import os
import pathlib
import sys
import warnings
warnings.filterwarnings('ignore', '.*box bound precision lowered.*')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['MUJOCO_GL'] = 'egl'
import numpy as np
import ruamel.yaml as yaml
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as prec
tf.get_logger().setLevel('ERROR')
from tensorflow_probability import distributions as tfd
sys.path.append(str(pathlib.Path(__file__).parent))
import exploration as expl
import models
import tools
import wrappers
class Dreamer(tools.Module):
def __init__(self, config, logger, dataset):
self._config = config
self._logger = logger
self._float = prec.global_policy().compute_dtype
self._should_log = tools.Every(config.log_every)
self._should_train = tools.Every(config.train_every)
self._should_pretrain = tools.Once()
self._should_reset = tools.Every(config.reset_every)
self._should_expl = tools.Until(int(
config.expl_until / config.action_repeat))
self._metrics = collections.defaultdict(tf.metrics.Mean)
with tf.device('cpu:0'):
self._step = tf.Variable(count_steps(config.traindir), dtype=tf.int64)
# Schedules.
config.actor_entropy = (
lambda x=config.actor_entropy: tools.schedule(x, self._step))
config.actor_state_entropy = (
lambda x=config.actor_state_entropy: tools.schedule(x, self._step))
config.imag_gradient_mix = (
lambda x=config.imag_gradient_mix: tools.schedule(x, self._step))
self._dataset = iter(dataset)
self._wm = models.WorldModel(self._step, config)
self._task_behavior = models.ImagBehavior(
config, self._wm, config.behavior_stop_grad)
reward = lambda f, s, a: self._wm.heads['reward'](f).mode()
self._expl_behavior = dict(
greedy=lambda: self._task_behavior,
random=lambda: expl.Random(config),
plan2explore=lambda: expl.Plan2Explore(config, self._wm, reward),
)[config.expl_behavior]()
# Train step to initialize variables including optimizer statistics.
self._train(next(self._dataset))
def __call__(self, obs, reset, state=None, training=True):
step = self._step.numpy().item()
if self._should_reset(step):
state = None
if state is not None and reset.any():
mask = tf.cast(1 - reset, self._float)[:, None]
state = tf.nest.map_structure(lambda x: x * mask, state)
if training and self._should_train(step):
steps = (
self._config.pretrain if self._should_pretrain()
else self._config.train_steps)
for _ in range(steps):
self._train(next(self._dataset))
if self._should_log(step):
for name, mean in self._metrics.items():
self._logger.scalar(name, float(mean.result()))
mean.reset_states()
openl_joint, openl_main, openl_disen, openl_mask = self._wm.video_pred(next(self._dataset))
self._logger.video('train_openl_joint', openl_joint)
self._logger.video('train_openl_main', openl_main)
self._logger.video('train_openl_disen', openl_disen)
self._logger.video('train_openl_mask', openl_mask)
self._logger.write(fps=True)
action, state = self._policy(obs, state, training)
if training:
self._step.assign_add(len(reset))
self._logger.step = self._config.action_repeat \
* self._step.numpy().item()
return action, state
@tf.function
def _policy(self, obs, state, training):
if state is None:
batch_size = len(obs['image'])
latent = self._wm.dynamics.initial(len(obs['image']))
action = tf.zeros((batch_size, self._config.num_actions), self._float)
else:
latent, action = state
embed = self._wm.encoder(self._wm.preprocess(obs))
latent, _ = self._wm.dynamics.obs_step(
latent, action, embed, self._config.collect_dyn_sample)
if self._config.eval_state_mean:
latent['stoch'] = latent['mean']
feat = self._wm.dynamics.get_feat(latent)
if not training:
action = self._task_behavior.actor(feat).mode()
elif self._should_expl(self._step):
action = self._expl_behavior.actor(feat).sample()
else:
action = self._task_behavior.actor(feat).sample()
if self._config.actor_dist == 'onehot_gumble':
action = tf.cast(
tf.one_hot(tf.argmax(action, axis=-1), self._config.num_actions),
action.dtype)
action = self._exploration(action, training)
state = (latent, action)
return action, state
def _exploration(self, action, training):
amount = self._config.expl_amount if training else self._config.eval_noise
if amount == 0:
return action
amount = tf.cast(amount, self._float)
if 'onehot' in self._config.actor_dist:
probs = amount / self._config.num_actions + (1 - amount) * action
return tools.OneHotDist(probs=probs).sample()
else:
return tf.clip_by_value(tfd.Normal(action, amount).sample(), -1, 1)
raise NotImplementedError(self._config.action_noise)
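# Descriptive note (added): for one-hot (discrete) policies the branch above mixes
# the policy's probabilities with a uniform distribution weighted by `amount`
# (an epsilon-greedy style perturbation); for continuous actions it adds Gaussian
# noise with stddev `amount` and clips back to [-1, 1].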
@tf.function
def _train(self, data):
print('Tracing train function.')
metrics = {}
embed, post, feat, kl, mets = self._wm.train(data)
metrics.update(mets)
start = post
if self._config.pred_discount: # Last step could be terminal.
start = {k: v[:, :-1] for k, v in post.items()}
embed, feat, kl = embed[:, :-1], feat[:, :-1], kl[:, :-1]
reward = lambda f, s, a: self._wm.heads['reward'](f).mode()
metrics.update(self._task_behavior.train(start, reward)[-1])
if self._config.expl_behavior != 'greedy':
mets = self._expl_behavior.train(start, feat, embed, kl)[-1]
metrics.update({'expl_' + key: value for key, value in mets.items()})
for name, value in metrics.items():
self._metrics[name].update_state(value)
def count_steps(folder):
return sum(int(str(n).split('-')[-1][:-4]) - 1 for n in folder.glob('*.npz'))
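# Descriptive note (added): this assumes episode files carry a trailing
# '-<length>.npz' suffix (presumably produced by tools.save_episodes); the `- 1`
# discounts the dummy reset transition so the count reflects environment steps
# actually taken.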
def make_dataset(episodes, config):
example = episodes[next(iter(episodes.keys()))]
types = {k: v.dtype for k, v in example.items()}
shapes = {k: (None,) + v.shape[1:] for k, v in example.items()}
generator = lambda: tools.sample_episodes(
episodes, config.batch_length, config.oversample_ends)
dataset = tf.data.Dataset.from_generator(generator, types, shapes)
dataset = dataset.batch(config.batch_size, drop_remainder=True)
dataset = dataset.prefetch(10)
return dataset
def make_env(config, logger, mode, train_eps, eval_eps):
suite, task = config.task.split('_', 1)
if suite == 'dmc':
env = wrappers.DeepMindControl(task, config.action_repeat, config.size)
env = wrappers.NormalizeActions(env)
elif suite == 'atari':
env = wrappers.Atari(
task, config.action_repeat, config.size,
grayscale=config.atari_grayscale,
life_done=False and (mode == 'train'),
sticky_actions=True,
all_actions=True)
env = wrappers.OneHotAction(env)
else:
raise NotImplementedError(suite)
env = wrappers.TimeLimit(env, config.time_limit)
callbacks = [functools.partial(
process_episode, config, logger, mode, train_eps, eval_eps)]
env = wrappers.CollectDataset(env, callbacks)
env = wrappers.RewardObs(env)
return env
def process_episode(config, logger, mode, train_eps, eval_eps, episode):
directory = dict(train=config.traindir, eval=config.evaldir)[mode]
cache = dict(train=train_eps, eval=eval_eps)[mode]
filename = tools.save_episodes(directory, [episode])[0]
length = len(episode['reward']) - 1
score = float(episode['reward'].astype(np.float64).sum())
video = episode['image']
if mode == 'eval':
cache.clear()
if mode == 'train' and config.dataset_size:
total = 0
for key, ep in reversed(sorted(cache.items(), key=lambda x: x[0])):
if total <= config.dataset_size - length:
total += len(ep['reward']) - 1
else:
del cache[key]
logger.scalar('dataset_size', total + length)
cache[str(filename)] = episode
print(f'{mode.title()} episode has {length} steps and return {score:.1f}.')
logger.scalar(f'{mode}_return', score)
logger.scalar(f'{mode}_length', length)
logger.scalar(f'{mode}_episodes', len(cache))
if mode == 'eval' or config.expl_gifs:
logger.video(f'{mode}_policy', video[None])
logger.write()
def main(logdir, config):
logdir = os.path.join(
logdir, config.task, 'Ours', str(config.seed))
logdir = pathlib.Path(logdir).expanduser()
config.traindir = config.traindir or logdir / 'train_eps'
config.evaldir = config.evaldir or logdir / 'eval_eps'
config.steps //= config.action_repeat
config.eval_every //= config.action_repeat
config.log_every //= config.action_repeat
config.time_limit //= config.action_repeat
config.act = getattr(tf.nn, config.act)
if config.debug:
tf.config.experimental_run_functions_eagerly(True)
if config.gpu_growth:
message = 'No GPU found. To actually train on CPU remove this assert.'
assert tf.config.experimental.list_physical_devices('GPU'), message
for gpu in tf.config.experimental.list_physical_devices('GPU'):
tf.config.experimental.set_memory_growth(gpu, True)
assert config.precision in (16, 32), config.precision
if config.precision == 16:
prec.set_policy(prec.Policy('mixed_float16'))
print('Logdir', logdir)
logdir.mkdir(parents=True, exist_ok=True)
config.traindir.mkdir(parents=True, exist_ok=True)
config.evaldir.mkdir(parents=True, exist_ok=True)
step = count_steps(config.traindir)
logger = tools.Logger(logdir, config.action_repeat * step)
print('Create envs.')
if config.offline_traindir:
directory = config.offline_traindir.format(**vars(config))
else:
directory = config.traindir
train_eps = tools.load_episodes(directory, limit=config.dataset_size)
if config.offline_evaldir:
directory = config.offline_evaldir.format(**vars(config))
else:
directory = config.evaldir
eval_eps = tools.load_episodes(directory, limit=1)
make = lambda mode: make_env(config, logger, mode, train_eps, eval_eps)
train_envs = [make('train') for _ in range(config.envs)]
eval_envs = [make('eval') for _ in range(config.envs)]
acts = train_envs[0].action_space
config.num_actions = acts.n if hasattr(acts, 'n') else acts.shape[0]
prefill = max(0, config.prefill - count_steps(config.traindir))
print(f'Prefill dataset ({prefill} steps).')
random_agent = lambda o, d, s: ([acts.sample() for _ in d], s)
tools.simulate(random_agent, train_envs, prefill)
tools.simulate(random_agent, eval_envs, episodes=1)
logger.step = config.action_repeat * count_steps(config.traindir)
print('Simulate agent.')
train_dataset = make_dataset(train_eps, config)
eval_dataset = iter(make_dataset(eval_eps, config))
agent = Dreamer(config, logger, train_dataset)
if (logdir / 'variables.pkl').exists():
agent.load(logdir / 'variables.pkl')
agent._should_pretrain._once = False
state = None
suite, task = config.task.split('_', 1)
num_eval_episodes = 10 if suite == 'procgen' else 1
while agent._step.numpy().item() < config.steps:
logger.write()
print('Start evaluation.')
openl_joint, openl_main, openl_disen, openl_mask = agent._wm.video_pred(next(eval_dataset))
logger.video('eval_openl_joint', openl_joint)
logger.video('eval_openl_main', openl_main)
logger.video('eval_openl_disen', openl_disen)
logger.video('eval_openl_mask', openl_mask)
eval_policy = functools.partial(agent, training=False)
tools.simulate(eval_policy, eval_envs, episodes=num_eval_episodes)
print('Start training.')
state = tools.simulate(agent, train_envs, config.eval_every, state=state)
agent.save(logdir / 'variables.pkl')
for env in train_envs + eval_envs:
try:
env.close()
except Exception:
pass
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--configs', nargs='+', required=True)
args, remaining = parser.parse_known_args()
configs = yaml.safe_load((pathlib.Path(__file__).parent / 'configs.yaml').read_text())
config_ = {}
for name in args.configs:
config_.update(configs[name])
parser = argparse.ArgumentParser()
for key, value in config_.items():
arg_type = tools.args_type(value)
parser.add_argument(f'--{key}', type=arg_type, default=arg_type(value))
main(config_['logdir'], parser.parse_args(remaining))

83
DreamerV2/exploration.py Normal file
View File

@ -0,0 +1,83 @@
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as prec
from tensorflow_probability import distributions as tfd
import models
import networks
import tools
class Random(tools.Module):
def __init__(self, config):
self._config = config
self._float = prec.global_policy().compute_dtype
def actor(self, feat):
shape = feat.shape[:-1] + [self._config.num_actions]
if self._config.actor_dist == 'onehot':
return tools.OneHotDist(tf.zeros(shape))
else:
ones = tf.ones(shape, self._float)
return tfd.Uniform(-ones, ones)
def train(self, start, feat, embed, kl):
return None, {}
class Plan2Explore(tools.Module):
def __init__(self, config, world_model, reward=None):
self._config = config
self._reward = reward
self._behavior = models.ImagBehavior(config, world_model)
self.actor = self._behavior.actor
size = {
'embed': 32 * config.cnn_depth,
'stoch': config.dyn_stoch,
'deter': config.dyn_deter,
'feat': config.dyn_stoch + config.dyn_deter,
}[self._config.disag_target]
kw = dict(
shape=size, layers=config.disag_layers, units=config.disag_units,
act=config.act)
self._networks = [
networks.DenseHead(**kw) for _ in range(config.disag_models)]
self._opt = tools.Optimizer(
'ensemble', config.model_lr, config.opt_eps, config.grad_clip,
config.weight_decay, opt=config.opt)
def train(self, start, feat, embed, kl):
metrics = {}
target = {
'embed': embed,
'stoch': start['stoch'],
'deter': start['deter'],
'feat': feat,
}[self._config.disag_target]
metrics.update(self._train_ensemble(feat, target))
metrics.update(self._behavior.train(start, self._intrinsic_reward)[-1])
return None, metrics
def _intrinsic_reward(self, feat, state, action):
preds = [head(feat, tf.float32).mean() for head in self._networks]
disag = tf.reduce_mean(tf.math.reduce_std(preds, 0), -1)
if self._config.disag_log:
disag = tf.math.log(disag)
reward = self._config.expl_intr_scale * disag
if self._config.expl_extr_scale:
reward += tf.cast(self._config.expl_extr_scale * self._reward(
feat, state, action), tf.float32)
return reward
def _train_ensemble(self, inputs, targets):
if self._config.disag_offset:
targets = targets[:, self._config.disag_offset:]
inputs = inputs[:, :-self._config.disag_offset]
targets = tf.stop_gradient(targets)
inputs = tf.stop_gradient(inputs)
with tf.GradientTape() as tape:
preds = [head(inputs) for head in self._networks]
likes = [tf.reduce_mean(pred.log_prob(targets)) for pred in preds]
loss = -tf.cast(tf.reduce_sum(likes), tf.float32)
metrics = self._opt(tape, loss, self._networks)
return metrics

429
DreamerV2/models.py Normal file
View File

@ -0,0 +1,429 @@
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as prec
import networks
import tools
class WorldModel(tools.Module):
def __init__(self, step, config):
self._step = step
self._config = config
channels = (1 if config.atari_grayscale else 3)
shape = config.size + (channels,)
########
# Main #
########
self.encoder = networks.ConvEncoder(
config.cnn_depth, config.act, config.encoder_kernels)
self.dynamics = networks.RSSM(
config.dyn_stoch, config.dyn_deter, config.dyn_hidden,
config.dyn_input_layers, config.dyn_output_layers, config.dyn_shared,
config.dyn_discrete, config.act, config.dyn_mean_act,
config.dyn_std_act, config.dyn_min_std, config.dyn_cell)
self.heads = {}
self.heads['reward'] = networks.DenseHead(
[], config.reward_layers, config.units, config.act)
if config.pred_discount:
self.heads['discount'] = networks.DenseHead(
[], config.discount_layers, config.units, config.act, dist='binary')
self._model_opt = tools.Optimizer(
'model', config.model_lr, config.opt_eps, config.grad_clip,
config.weight_decay, opt=config.opt)
self._scales = dict(
reward=config.reward_scale, discount=config.discount_scale)
#########
# Disen #
#########
self.disen_encoder = networks.ConvEncoder(
config.disen_cnn_depth, config.act, config.encoder_kernels)
self.disen_dynamics = networks.RSSM(
config.dyn_stoch, config.dyn_deter, config.dyn_hidden,
config.dyn_input_layers, config.dyn_output_layers, config.dyn_shared,
config.dyn_discrete, config.act, config.dyn_mean_act,
config.dyn_std_act, config.dyn_min_std, config.dyn_cell)
self.disen_heads = {}
self.disen_heads['reward'] = networks.DenseHead(
[], config.reward_layers, config.units, config.act)
if config.pred_discount:
self.disen_heads['discount'] = networks.DenseHead(
[], config.discount_layers, config.units, config.act, dist='binary')
self._disen_model_opt = tools.Optimizer(
'disen', config.model_lr, config.opt_eps, config.grad_clip,
config.weight_decay, opt=config.opt)
self._disen_heads_opt = {}
self._disen_heads_opt['reward'] = tools.Optimizer(
'disen_reward', config.model_lr, config.opt_eps, config.grad_clip,
config.weight_decay, opt=config.opt)
if config.pred_discount:
self._disen_heads_opt['discount'] = tools.Optimizer(
'disen_pcont', config.model_lr, config.opt_eps, config.grad_clip,
config.weight_decay, opt=config.opt)
# Negative scales below: the disen model is penalized for predicting reward/discount (adversarial signal)
self._disen_scales = dict(disen_only=config.disen_only_scale,
reward=-config.disen_reward_scale, discount=-config.disen_discount_scale)
self.disen_only_image_head = networks.ConvDecoder(
config.disen_cnn_depth, config.act, shape, config.decoder_kernels,
config.decoder_thin)
################
# Joint Decode #
################
self.image_head = networks.ConvDecoderMask(
config.cnn_depth, config.act, shape, config.decoder_kernels,
config.decoder_thin)
self.disen_image_head = networks.ConvDecoderMask(
config.disen_cnn_depth, config.act, shape, config.decoder_kernels,
config.decoder_thin)
self.joint_image_head = networks.ConvDecoderMaskEnsemble(
self.image_head, self.disen_image_head
)
def train(self, data):
data = self.preprocess(data)
with tf.GradientTape() as model_tape, tf.GradientTape() as disen_tape:
# kl schedule
kl_balance = tools.schedule(self._config.kl_balance, self._step)
kl_free = tools.schedule(self._config.kl_free, self._step)
kl_scale = tools.schedule(self._config.kl_scale, self._step)
# Main
embed = self.encoder(data)
post, prior = self.dynamics.observe(embed, data['action'])
kl_loss, kl_value = self.dynamics.kl_loss(
post, prior, kl_balance, kl_free, kl_scale)
feat = self.dynamics.get_feat(post)
likes = {}
for name, head in self.heads.items():
grad_head = (name in self._config.grad_heads)
inp = feat if grad_head else tf.stop_gradient(feat)
pred = head(inp, tf.float32)
like = pred.log_prob(tf.cast(data[name], tf.float32))
likes[name] = tf.reduce_mean(
like) * self._scales.get(name, 1.0)
# Disen
embed_disen = self.disen_encoder(data)
post_disen, prior_disen = self.disen_dynamics.observe(
embed_disen, data['action'])
kl_loss_disen, kl_value_disen = self.dynamics.kl_loss(
post_disen, prior_disen, kl_balance, kl_free, kl_scale)
feat_disen = self.disen_dynamics.get_feat(post_disen)
# Optimize the disen reward/pcont heads for several iterations (to near-optimality) before the adversarial update
disen_metrics = dict(reward={}, discount={})
loss_disen = dict(reward=None, discount=None)
for _ in range(self._config.num_reward_opt_iters):
with tf.GradientTape() as disen_reward_tape, tf.GradientTape() as disen_pcont_tape:
disen_gradient_tapes = dict(
reward=disen_reward_tape, discount=disen_pcont_tape)
for name, head in self.disen_heads.items():
pred_disen = head(
tf.stop_gradient(feat_disen), tf.float32)
loss_disen[name] = -tf.reduce_mean(pred_disen.log_prob(
tf.cast(data[name], tf.float32)))
for name, head in self.disen_heads.items():
disen_metrics[name] = self._disen_heads_opt[name](
disen_gradient_tapes[name], loss_disen[name], [head], prefix='disen_neg')
# Compute likes for disen model (including negative gradients)
likes_disen = {}
for name, head in self.disen_heads.items():
pred_disen = head(feat_disen, tf.float32)
like_disen = pred_disen.log_prob(
tf.cast(data[name], tf.float32))
likes_disen[name] = tf.reduce_mean(
like_disen) * self._disen_scales.get(name, -1.0)
disen_only_image_pred = self.disen_only_image_head(
feat_disen, tf.float32)
disen_only_image_like = tf.reduce_mean(disen_only_image_pred.log_prob(
tf.cast(data['image'], tf.float32))) * self._disen_scales.get('disen_only', 1.0)
likes_disen['disen_only'] = disen_only_image_like
# Joint decode
image_pred_joint, _, _, _ = self.joint_image_head(
feat, feat_disen, tf.float32)
image_like = tf.reduce_mean(image_pred_joint.log_prob(
tf.cast(data['image'], tf.float32)))
likes['image'] = image_like
likes_disen['image'] = image_like
# Compute loss
model_loss = kl_loss - sum(likes.values())
disen_loss = kl_loss_disen - sum(likes_disen.values())
model_parts = [self.encoder, self.dynamics,
self.joint_image_head] + list(self.heads.values())
disen_parts = [self.disen_encoder, self.disen_dynamics,
self.joint_image_head, self.disen_only_image_head]
metrics = self._model_opt(
model_tape, model_loss, model_parts, prefix='main')
disen_model_metrics = self._disen_model_opt(
disen_tape, disen_loss, disen_parts, prefix='disen')
metrics['kl_balance'] = kl_balance
metrics['kl_free'] = kl_free
metrics['kl_scale'] = kl_scale
metrics.update({f'{name}_loss': -like for name,
like in likes.items()})
metrics['disen/disen_only_image_loss'] = -disen_only_image_like
metrics['disen/disen_reward_loss'] = -likes_disen['reward'] / \
self._disen_scales.get('reward', -1.0)
metrics['disen/disen_discount_loss'] = -likes_disen['discount'] / \
self._disen_scales.get('discount', -1.0)
metrics['kl'] = tf.reduce_mean(kl_value)
metrics['prior_ent'] = self.dynamics.get_dist(prior).entropy()
metrics['post_ent'] = self.dynamics.get_dist(post).entropy()
metrics['disen/kl'] = tf.reduce_mean(kl_value_disen)
metrics['disen/prior_ent'] = self.dynamics.get_dist(
prior_disen).entropy()
metrics['disen/post_ent'] = self.dynamics.get_dist(
post_disen).entropy()
metrics.update(
{f'{key}': value for key, value in disen_metrics['reward'].items()})
metrics.update(
{f'{key}': value for key, value in disen_metrics['discount'].items()})
metrics.update(
{f'{key}': value for key, value in disen_model_metrics.items()})
return embed, post, feat, kl_value, metrics
@tf.function
def preprocess(self, obs):
dtype = prec.global_policy().compute_dtype
obs = obs.copy()
obs['image'] = tf.cast(obs['image'], dtype) / 255.0 - 0.5
obs['reward'] = getattr(tf, self._config.clip_rewards)(obs['reward'])
if 'discount' in obs:
obs['discount'] *= self._config.discount
for key, value in obs.items():
if tf.dtypes.as_dtype(value.dtype) in (
tf.float16, tf.float32, tf.float64):
obs[key] = tf.cast(value, dtype)
return obs
@tf.function
def video_pred(self, data):
data = self.preprocess(data)
truth = data['image'][:6] + 0.5
embed = self.encoder(data)
embed_disen = self.disen_encoder(data)
states, _ = self.dynamics.observe(
embed[:6, :5], data['action'][:6, :5])
states_disen, _ = self.disen_dynamics.observe(
embed_disen[:6, :5], data['action'][:6, :5])
feats = self.dynamics.get_feat(states)
feats_disen = self.disen_dynamics.get_feat(states_disen)
recon_joint, recon_main, recon_disen, recon_mask = self.joint_image_head(
feats, feats_disen)
recon_joint = recon_joint.mode()[:6]
recon_main = recon_main.mode()[:6]
recon_disen = recon_disen.mode()[:6]
recon_mask = recon_mask[:6]
init = {k: v[:, -1] for k, v in states.items()}
init_disen = {k: v[:, -1] for k, v in states_disen.items()}
prior = self.dynamics.imagine(
data['action'][:6, 5:], init)
prior_disen = self.disen_dynamics.imagine(
data['action'][:6, 5:], init_disen)
_feats = self.dynamics.get_feat(prior)
_feats_disen = self.disen_dynamics.get_feat(prior_disen)
openl_joint, openl_main, openl_disen, openl_mask = self.joint_image_head(
_feats, _feats_disen)
openl_joint = openl_joint.mode()
openl_main = openl_main.mode()
openl_disen = openl_disen.mode()
model_joint = tf.concat(
[recon_joint[:, :5] + 0.5, openl_joint + 0.5], 1)
error_joint = (model_joint - truth + 1) / 2
model_main = tf.concat(
[recon_main[:, :5] + 0.5, openl_main + 0.5], 1)
error_main = (model_main - truth + 1) / 2
model_disen = tf.concat(
[recon_disen[:, :5] + 0.5, openl_disen + 0.5], 1)
error_disen = (model_disen - truth + 1) / 2
model_mask = tf.concat(
[recon_mask[:, :5] + 0.5, openl_mask + 0.5], 1)
output_joint = tf.concat([truth, model_joint, error_joint], 2)
output_main = tf.concat([truth, model_main, error_main], 2)
output_disen = tf.concat([truth, model_disen, error_disen], 2)
output_mask = model_mask
return output_joint, output_main, output_disen, output_mask
class ImagBehavior(tools.Module):
def __init__(self, config, world_model, stop_grad_actor=True, reward=None):
self._config = config
self._world_model = world_model
self._stop_grad_actor = stop_grad_actor
self._reward = reward
self.actor = networks.ActionHead(
config.num_actions, config.actor_layers, config.units, config.act,
config.actor_dist, config.actor_init_std, config.actor_min_std,
config.actor_disc, config.actor_temp, config.actor_outscale)
self.value = networks.DenseHead(
[], config.value_layers, config.units, config.act,
config.value_head)
if config.slow_value_target or config.slow_actor_target:
self._slow_value = networks.DenseHead(
[], config.value_layers, config.units, config.act)
self._updates = tf.Variable(0, tf.int64)
kw = dict(wd=config.weight_decay, opt=config.opt)
self._actor_opt = tools.Optimizer(
'actor', config.actor_lr, config.opt_eps, config.actor_grad_clip, **kw)
self._value_opt = tools.Optimizer(
'value', config.value_lr, config.opt_eps, config.value_grad_clip, **kw)
def train(
self, start, objective=None, imagine=None, tape=None, repeats=None):
objective = objective or self._reward
self._update_slow_target()
metrics = {}
with (tape or tf.GradientTape()) as actor_tape:
assert bool(objective) != bool(imagine)
if objective:
imag_feat, imag_state, imag_action = self._imagine(
start, self.actor, self._config.imag_horizon, repeats)
reward = objective(imag_feat, imag_state, imag_action)
else:
imag_feat, imag_state, imag_action, reward = imagine(start)
actor_ent = self.actor(imag_feat, tf.float32).entropy()
state_ent = self._world_model.dynamics.get_dist(
imag_state, tf.float32).entropy()
target, weights = self._compute_target(
imag_feat, reward, actor_ent, state_ent,
self._config.slow_actor_target)
actor_loss, mets = self._compute_actor_loss(
imag_feat, imag_state, imag_action, target, actor_ent, state_ent,
weights)
metrics.update(mets)
if self._config.slow_value_target != self._config.slow_actor_target:
target, weights = self._compute_target(
imag_feat, reward, actor_ent, state_ent,
self._config.slow_value_target)
with tf.GradientTape() as value_tape:
value = self.value(imag_feat, tf.float32)[:-1]
value_loss = -value.log_prob(tf.stop_gradient(target))
if self._config.value_decay:
value_loss += self._config.value_decay * value.mode()
value_loss = tf.reduce_mean(weights[:-1] * value_loss)
metrics['reward_mean'] = tf.reduce_mean(reward)
metrics['reward_std'] = tf.math.reduce_std(reward)
metrics['actor_ent'] = tf.reduce_mean(actor_ent)
metrics.update(self._actor_opt(actor_tape, actor_loss, [self.actor]))
metrics.update(self._value_opt(value_tape, value_loss, [self.value]))
return imag_feat, imag_state, imag_action, weights, metrics
def _imagine(self, start, policy, horizon, repeats=None):
dynamics = self._world_model.dynamics
if repeats:
start = {k: tf.repeat(v, repeats, axis=1)
for k, v in start.items()}
def flatten(x): return tf.reshape(x, [-1] + list(x.shape[2:]))
start = {k: flatten(v) for k, v in start.items()}
def step(prev, _):
state, _, _ = prev
feat = dynamics.get_feat(state)
inp = tf.stop_gradient(feat) if self._stop_grad_actor else feat
action = policy(inp).sample()
succ = dynamics.img_step(
state, action, sample=self._config.imag_sample)
return succ, feat, action
feat = 0 * dynamics.get_feat(start)
action = policy(feat).mode()
succ, feats, actions = tools.static_scan(
step, tf.range(horizon), (start, feat, action))
states = {k: tf.concat([
start[k][None], v[:-1]], 0) for k, v in succ.items()}
if repeats:
def unfold(tensor):
s = tensor.shape
return tf.reshape(tensor, [s[0], s[1] // repeats, repeats] + s[2:])
states, feats, actions = tf.nest.map_structure(
unfold, (states, feats, actions))
return feats, states, actions
def _compute_target(self, imag_feat, reward, actor_ent, state_ent, slow):
reward = tf.cast(reward, tf.float32)
if 'discount' in self._world_model.heads:
discount = self._world_model.heads['discount'](
imag_feat, tf.float32).mean()
else:
discount = self._config.discount * tf.ones_like(reward)
if self._config.future_entropy and tf.greater(
self._config.actor_entropy(), 0):
reward += self._config.actor_entropy() * actor_ent
if self._config.future_entropy and tf.greater(
self._config.actor_state_entropy(), 0):
reward += self._config.actor_state_entropy() * state_ent
if slow:
value = self._slow_value(imag_feat, tf.float32).mode()
else:
value = self.value(imag_feat, tf.float32).mode()
target = tools.lambda_return(
reward[:-1], value[:-1], discount[:-1],
bootstrap=value[-1], lambda_=self._config.discount_lambda, axis=0)
weights = tf.stop_gradient(tf.math.cumprod(tf.concat(
[tf.ones_like(discount[:1]), discount[:-1]], 0), 0))
return target, weights
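# Descriptive note (added): the target is the TD(lambda) return computed from
# imagined rewards, predicted discounts, and the (slow) value estimates, while
# `weights` is the stop-gradded cumulative product of discounts, which down-weights
# imagined steps that follow a predicted episode end.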
def _compute_actor_loss(
self, imag_feat, imag_state, imag_action, target, actor_ent, state_ent,
weights):
metrics = {}
inp = tf.stop_gradient(
imag_feat) if self._stop_grad_actor else imag_feat
policy = self.actor(inp, tf.float32)
actor_ent = policy.entropy()
if self._config.imag_gradient == 'dynamics':
actor_target = target
elif self._config.imag_gradient == 'reinforce':
imag_action = tf.cast(imag_action, tf.float32)
actor_target = policy.log_prob(imag_action)[:-1] * tf.stop_gradient(
target - self.value(imag_feat[:-1], tf.float32).mode())
elif self._config.imag_gradient == 'both':
imag_action = tf.cast(imag_action, tf.float32)
actor_target = policy.log_prob(imag_action)[:-1] * tf.stop_gradient(
target - self.value(imag_feat[:-1], tf.float32).mode())
mix = self._config.imag_gradient_mix()
actor_target = mix * target + (1 - mix) * actor_target
metrics['imag_gradient_mix'] = mix
else:
raise NotImplementedError(self._config.imag_gradient)
if not self._config.future_entropy and tf.greater(
self._config.actor_entropy(), 0):
actor_target += self._config.actor_entropy() * actor_ent[:-1]
if not self._config.future_entropy and tf.greater(
self._config.actor_state_entropy(), 0):
actor_target += self._config.actor_state_entropy() * state_ent[:-1]
actor_loss = -tf.reduce_mean(weights[:-1] * actor_target)
return actor_loss, metrics
def _update_slow_target(self):
if self._config.slow_value_target or self._config.slow_actor_target:
if self._updates % self._config.slow_target_update == 0:
mix = self._config.slow_target_fraction
for s, d in zip(self.value.variables, self._slow_value.variables):
d.assign(mix * s + (1 - mix) * d)
self._updates.assign_add(1)

465
DreamerV2/networks.py Normal file
View File

@ -0,0 +1,465 @@
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers as tfkl
from tensorflow_probability import distributions as tfd
from tensorflow.keras.mixed_precision import experimental as prec
import tools
class RSSM(tools.Module):
def __init__(
self, stoch=30, deter=200, hidden=200, layers_input=1, layers_output=1,
shared=False, discrete=False, act=tf.nn.elu, mean_act='none',
std_act='softplus', min_std=0.1, cell='keras'):
super().__init__()
self._stoch = stoch
self._deter = deter
self._hidden = hidden
self._min_std = min_std
self._layers_input = layers_input
self._layers_output = layers_output
self._shared = shared
self._discrete = discrete
self._act = act
self._mean_act = mean_act
self._std_act = std_act
self._embed = None
if cell == 'gru':
self._cell = tfkl.GRUCell(self._deter)
elif cell == 'gru_layer_norm':
self._cell = GRUCell(self._deter, norm=True)
else:
raise NotImplementedError(cell)
def initial(self, batch_size):
dtype = prec.global_policy().compute_dtype
if self._discrete:
state = dict(
logit=tf.zeros(
[batch_size, self._stoch, self._discrete], dtype),
stoch=tf.zeros(
[batch_size, self._stoch, self._discrete], dtype),
deter=self._cell.get_initial_state(None, batch_size, dtype))
else:
state = dict(
mean=tf.zeros([batch_size, self._stoch], dtype),
std=tf.zeros([batch_size, self._stoch], dtype),
stoch=tf.zeros([batch_size, self._stoch], dtype),
deter=self._cell.get_initial_state(None, batch_size, dtype))
return state
@tf.function
def observe(self, embed, action, state=None):
def swap(x): return tf.transpose(
x, [1, 0] + list(range(2, len(x.shape))))
if state is None:
state = self.initial(tf.shape(action)[0])
embed, action = swap(embed), swap(action)
post, prior = tools.static_scan(
lambda prev, inputs: self.obs_step(prev[0], *inputs),
(action, embed), (state, state))
post = {k: swap(v) for k, v in post.items()}
prior = {k: swap(v) for k, v in prior.items()}
return post, prior
@tf.function
def imagine(self, action, state=None):
def swap(x): return tf.transpose(
x, [1, 0] + list(range(2, len(x.shape))))
if state is None:
state = self.initial(tf.shape(action)[0])
assert isinstance(state, dict), state
action = swap(action)
prior = tools.static_scan(self.img_step, action, state)
prior = {k: swap(v) for k, v in prior.items()}
return prior
def get_feat(self, state):
stoch = state['stoch']
if self._discrete:
shape = stoch.shape[:-2] + [self._stoch * self._discrete]
stoch = tf.reshape(stoch, shape)
return tf.concat([stoch, state['deter']], -1)
def get_dist(self, state, dtype=None):
if self._discrete:
logit = state['logit']
logit = tf.cast(logit, tf.float32)
dist = tfd.Independent(tools.OneHotDist(logit), 1)
if dtype != tf.float32:
dist = tools.DtypeDist(dist, dtype or state['logit'].dtype)
else:
mean, std = state['mean'], state['std']
if dtype:
mean = tf.cast(mean, dtype)
std = tf.cast(std, dtype)
dist = tfd.MultivariateNormalDiag(mean, std)
return dist
@tf.function
def obs_step(self, prev_state, prev_action, embed, sample=True):
if not self._embed:
self._embed = embed.shape[-1]
prior = self.img_step(prev_state, prev_action, None, sample)
if self._shared:
post = self.img_step(prev_state, prev_action, embed, sample)
else:
x = tf.concat([prior['deter'], embed], -1)
for i in range(self._layers_output):
x = self.get(f'obi{i}', tfkl.Dense, self._hidden, self._act)(x)
stats = self._suff_stats_layer('obs', x)
if sample:
stoch = self.get_dist(stats).sample()
else:
stoch = self.get_dist(stats).mode()
post = {'stoch': stoch, 'deter': prior['deter'], **stats}
return post, prior
@tf.function
def img_step(self, prev_state, prev_action, embed=None, sample=True):
prev_stoch = prev_state['stoch']
if self._discrete:
shape = prev_stoch.shape[:-2] + [self._stoch * self._discrete]
prev_stoch = tf.reshape(prev_stoch, shape)
if self._shared:
if embed is None:
shape = prev_action.shape[:-1] + [self._embed]
embed = tf.zeros(shape, prev_action.dtype)
x = tf.concat([prev_stoch, prev_action, embed], -1)
else:
x = tf.concat([prev_stoch, prev_action], -1)
for i in range(self._layers_input):
x = self.get(f'ini{i}', tfkl.Dense, self._hidden, self._act)(x)
x, deter = self._cell(x, [prev_state['deter']])
deter = deter[0] # Keras wraps the state in a list.
for i in range(self._layers_output):
x = self.get(f'imo{i}', tfkl.Dense, self._hidden, self._act)(x)
stats = self._suff_stats_layer('ims', x)
if sample:
stoch = self.get_dist(stats).sample()
else:
stoch = self.get_dist(stats).mode()
prior = {'stoch': stoch, 'deter': deter, **stats}
return prior
def _suff_stats_layer(self, name, x):
if self._discrete:
x = self.get(name, tfkl.Dense, self._stoch *
self._discrete, None)(x)
logit = tf.reshape(x, x.shape[:-1] + [self._stoch, self._discrete])
return {'logit': logit}
else:
x = self.get(name, tfkl.Dense, 2 * self._stoch, None)(x)
mean, std = tf.split(x, 2, -1)
mean = {
'none': lambda: mean,
'tanh5': lambda: 5.0 * tf.math.tanh(mean / 5.0),
}[self._mean_act]()
std = {
'softplus': lambda: tf.nn.softplus(std),
'abs': lambda: tf.math.abs(std + 1),
'sigmoid': lambda: tf.nn.sigmoid(std),
'sigmoid2': lambda: 2 * tf.nn.sigmoid(std / 2),
}[self._std_act]()
std = std + self._min_std
return {'mean': mean, 'std': std}
def kl_loss(self, post, prior, balance, free, scale):
kld = tfd.kl_divergence
def dist(x): return self.get_dist(x, tf.float32)
if balance == 0.5:
value = kld(dist(prior), dist(post))
loss = tf.reduce_mean(tf.maximum(value, free))
else:
def sg(x): return tf.nest.map_structure(tf.stop_gradient, x)
value = kld(dist(prior), dist(sg(post)))
pri = tf.reduce_mean(value)
pos = tf.reduce_mean(kld(dist(sg(prior)), dist(post)))
pri, pos = tf.maximum(pri, free), tf.maximum(pos, free)
loss = balance * pri + (1 - balance) * pos
loss *= scale
return loss, value
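# Descriptive note (added): with balance != 0.5 this implements KL balancing: the
# `pri` term stops gradients through the posterior and therefore trains the prior,
# the `pos` term stops gradients through the prior and trains the posterior; both
# are clipped below by `free` (free nats) before mixing with weight `balance` and
# scaling by `scale`.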
class ConvEncoder(tools.Module):
def __init__(
self, depth=32, act=tf.nn.relu, kernels=(4, 4, 4, 4)):
self._act = act
self._depth = depth
self._kernels = kernels
def __call__(self, obs):
kwargs = dict(strides=2, activation=self._act)
Conv = tfkl.Conv2D
x = tf.reshape(obs['image'], (-1,) + tuple(obs['image'].shape[-3:]))
x = self.get('h1', Conv, 1 * self._depth,
self._kernels[0], **kwargs)(x)
x = self.get('h2', Conv, 2 * self._depth,
self._kernels[1], **kwargs)(x)
x = self.get('h3', Conv, 4 * self._depth,
self._kernels[2], **kwargs)(x)
x = self.get('h4', Conv, 8 * self._depth,
self._kernels[3], **kwargs)(x)
x = tf.reshape(x, [x.shape[0], np.prod(x.shape[1:])])
shape = tf.concat([tf.shape(obs['image'])[:-3], [x.shape[-1]]], 0)
return tf.reshape(x, shape)
class ConvDecoder(tools.Module):
def __init__(
self, depth=32, act=tf.nn.relu, shape=(64, 64, 3), kernels=(5, 5, 6, 6),
thin=True):
self._act = act
self._depth = depth
self._shape = shape
self._kernels = kernels
self._thin = thin
def __call__(self, features, dtype=None):
kwargs = dict(strides=2, activation=self._act)
ConvT = tfkl.Conv2DTranspose
if self._thin:
x = self.get('h1', tfkl.Dense, 32 * self._depth, None)(features)
x = tf.reshape(x, [-1, 1, 1, 32 * self._depth])
else:
x = self.get('h1', tfkl.Dense, 128 * self._depth, None)(features)
x = tf.reshape(x, [-1, 2, 2, 32 * self._depth])
x = self.get('h2', ConvT, 4 * self._depth,
self._kernels[0], **kwargs)(x)
x = self.get('h3', ConvT, 2 * self._depth,
self._kernels[1], **kwargs)(x)
x = self.get('h4', ConvT, 1 * self._depth,
self._kernels[2], **kwargs)(x)
x = self.get(
'h5', ConvT, self._shape[-1], self._kernels[3], strides=2)(x)
mean = tf.reshape(x, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
if dtype:
mean = tf.cast(mean, dtype)
return tfd.Independent(tfd.Normal(mean, 1), len(self._shape))
class ConvDecoderMask(tools.Module):
def __init__(
self, depth=32, act=tf.nn.relu, shape=(64, 64, 3), kernels=(5, 5, 6, 6),
thin=True):
self._act = act
self._depth = depth
self._shape = shape
self._kernels = kernels
self._thin = thin
def __call__(self, features, dtype=None):
kwargs = dict(strides=2, activation=self._act)
ConvT = tfkl.Conv2DTranspose
if self._thin:
x = self.get('h1', tfkl.Dense, 32 * self._depth, None)(features)
x = tf.reshape(x, [-1, 1, 1, 32 * self._depth])
else:
x = self.get('h1', tfkl.Dense, 128 * self._depth, None)(features)
x = tf.reshape(x, [-1, 2, 2, 32 * self._depth])
x = self.get('h2', ConvT, 4 * self._depth,
self._kernels[0], **kwargs)(x)
x = self.get('h3', ConvT, 2 * self._depth,
self._kernels[1], **kwargs)(x)
x = self.get('h4', ConvT, 1 * self._depth,
self._kernels[2], **kwargs)(x)
x = self.get(
'h5', ConvT, 2 * self._shape[-1], self._kernels[3], strides=2)(x)
mean, mask = tf.split(x, [self._shape[-1], self._shape[-1]], -1)
mean = tf.reshape(mean, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
mask = tf.reshape(mask, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
if dtype:
mean = tf.cast(mean, dtype)
mask = tf.cast(mask, dtype)
return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), mask
class ConvDecoderMaskEnsemble(tools.Module):
"""
Ensembles two ConvDecoderMask decoders with <Normal, mask> outputs by blending their means
with a learned pixel-wise mask.
NOTE: remove pred1/pred2 for maximum performance.
"""
def __init__(self, decoder1, decoder2):
self._decoder1 = decoder1
self._decoder2 = decoder2
self._shape = decoder1._shape
def __call__(self, feat1, feat2, dtype=None):
kwargs = dict(strides=1, activation=tf.nn.sigmoid)
pred1, mask1 = self._decoder1(feat1, dtype)
pred2, mask2 = self._decoder2(feat2, dtype)
mean1 = pred1.submodules[0].loc
mean2 = pred2.submodules[0].loc
mask_feat = tf.concat([mask1, mask2], -1)
mask = self.get('mask1', tfkl.Conv2D, 1, 1, **kwargs)(mask_feat)
mask_use1 = mask
mask_use2 = 1-mask
mean = mean1 * tf.cast(mask_use1, mean1.dtype) + \
mean2 * tf.cast(mask_use2, mean2.dtype)
return tfd.Independent(tfd.Normal(mean, 1), len(self._shape)), pred1, pred2, tf.cast(mask_use1, mean1.dtype)
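# Rough picture of the ensemble above: each ConvDecoderMask reconstructs the frame from its
# own features and emits a mask map; a 1x1 sigmoid convolution over the concatenated masks
# yields a blending weight m, and the joint prediction is m * mean1 + (1 - m) * mean2, so
# the first decoder claims the pixels where m is close to 1.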
class DenseHead(tools.Module):
def __init__(
self, shape, layers, units, act=tf.nn.elu, dist='normal', std=1.0):
self._shape = (shape,) if isinstance(shape, int) else shape
self._layers = layers
self._units = units
self._act = act
self._dist = dist
self._std = std
def __call__(self, features, dtype=None):
x = features
for index in range(self._layers):
x = self.get(f'h{index}', tfkl.Dense, self._units, self._act)(x)
mean = self.get(f'hmean', tfkl.Dense, np.prod(self._shape))(x)
mean = tf.reshape(mean, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
if self._std == 'learned':
std = self.get(f'hstd', tfkl.Dense, np.prod(self._shape))(x)
std = tf.nn.softplus(std) + 0.01
std = tf.reshape(std, tf.concat(
[tf.shape(features)[:-1], self._shape], 0))
else:
std = self._std
if dtype:
mean, std = tf.cast(mean, dtype), tf.cast(std, dtype)
if self._dist == 'normal':
return tfd.Independent(tfd.Normal(mean, std), len(self._shape))
if self._dist == 'huber':
return tfd.Independent(
tools.UnnormalizedHuber(mean, std, 1.0), len(self._shape))
if self._dist == 'binary':
return tfd.Independent(tfd.Bernoulli(mean), len(self._shape))
raise NotImplementedError(self._dist)
class ActionHead(tools.Module):
def __init__(
self, size, layers, units, act=tf.nn.elu, dist='trunc_normal',
init_std=0.0, min_std=0.1, action_disc=5, temp=0.1, outscale=0):
# assert min_std <= 2
self._size = size
self._layers = layers
self._units = units
self._dist = dist
self._act = act
self._min_std = min_std
self._init_std = init_std
self._action_disc = action_disc
self._temp = temp() if callable(temp) else temp
self._outscale = outscale
def __call__(self, features, dtype=None):
x = features
for index in range(self._layers):
kw = {}
if index == self._layers - 1 and self._outscale:
kw['kernel_initializer'] = tf.keras.initializers.VarianceScaling(
self._outscale)
x = self.get(f'h{index}', tfkl.Dense,
self._units, self._act, **kw)(x)
if self._dist == 'tanh_normal':
# https://www.desmos.com/calculator/rcmcf5jwe7
x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x)
if dtype:
x = tf.cast(x, dtype)
mean, std = tf.split(x, 2, -1)
mean = tf.tanh(mean)
std = tf.nn.softplus(std + self._init_std) + self._min_std
dist = tfd.Normal(mean, std)
dist = tfd.TransformedDistribution(dist, tools.TanhBijector())
dist = tfd.Independent(dist, 1)
dist = tools.SampleDist(dist)
elif self._dist == 'tanh_normal_5':
x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x)
if dtype:
x = tf.cast(x, dtype)
mean, std = tf.split(x, 2, -1)
mean = 5 * tf.tanh(mean / 5)
std = tf.nn.softplus(std + 5) + 5
dist = tfd.Normal(mean, std)
dist = tfd.TransformedDistribution(dist, tools.TanhBijector())
dist = tfd.Independent(dist, 1)
dist = tools.SampleDist(dist)
elif self._dist == 'normal':
x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x)
if dtype:
x = tf.cast(x, dtype)
mean, std = tf.split(x, 2, -1)
std = tf.nn.softplus(std + self._init_std) + self._min_std
dist = tfd.Normal(mean, std)
dist = tfd.Independent(dist, 1)
elif self._dist == 'normal_1':
mean = self.get(f'hout', tfkl.Dense, self._size)(x)
if dtype:
mean = tf.cast(mean, dtype)
dist = tfd.Normal(mean, 1)
dist = tfd.Independent(dist, 1)
elif self._dist == 'trunc_normal':
# https://www.desmos.com/calculator/mmuvuhnyxo
x = self.get(f'hout', tfkl.Dense, 2 * self._size)(x)
x = tf.cast(x, tf.float32)
mean, std = tf.split(x, 2, -1)
mean = tf.tanh(mean)
std = 2 * tf.nn.sigmoid(std / 2) + self._min_std
dist = tools.SafeTruncatedNormal(mean, std, -1, 1)
dist = tools.DtypeDist(dist, dtype)
dist = tfd.Independent(dist, 1)
elif self._dist == 'onehot':
x = self.get(f'hout', tfkl.Dense, self._size)(x)
x = tf.cast(x, tf.float32)
dist = tools.OneHotDist(x, dtype=dtype)
dist = tools.DtypeDist(dist, dtype)
elif self._dist == 'onehot_gumble':
x = self.get(f'hout', tfkl.Dense, self._size)(x)
if dtype:
x = tf.cast(x, dtype)
temp = self._temp
dist = tools.GumbleDist(temp, x, dtype=dtype)
else:
raise NotImplementedError(self._dist)
return dist
class GRUCell(tf.keras.layers.AbstractRNNCell):
def __init__(self, size, norm=False, act=tf.tanh, update_bias=-1, **kwargs):
super().__init__()
self._size = size
self._act = act
self._norm = norm
self._update_bias = update_bias
self._layer = tfkl.Dense(3 * size, use_bias=norm is not None, **kwargs)
if norm:
self._norm = tfkl.LayerNormalization(dtype=tf.float32)
@property
def state_size(self):
return self._size
def call(self, inputs, state):
state = state[0] # Keras wraps the state in a list.
parts = self._layer(tf.concat([inputs, state], -1))
if self._norm:
dtype = parts.dtype
parts = tf.cast(parts, tf.float32)
parts = self._norm(parts)
parts = tf.cast(parts, dtype)
reset, cand, update = tf.split(parts, 3, -1)
reset = tf.nn.sigmoid(reset)
cand = self._act(reset * cand)
update = tf.nn.sigmoid(update + self._update_bias)
output = update * cand + (1 - update) * state
return output, [output]
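# Minimal usage sketch (hypothetical sizes): the cell follows the Keras RNN-cell interface,
#   cell = GRUCell(200, norm=True)
#   out, [h] = cell(tf.zeros([16, 300]), [tf.zeros([16, 200])])
# where the gate pre-activations are layer-normalized in float32 before the
# reset/candidate/update split.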

694
DreamerV2/tools.py Normal file
View File

@ -0,0 +1,694 @@
import datetime
import io
import json
import pathlib
import pickle
import re
import time
import uuid
import numpy as np
import tensorflow as tf
import tensorflow.compat.v1 as tf1
import tensorflow_probability as tfp
from tensorflow.keras.mixed_precision import experimental as prec
from tensorflow_probability import distributions as tfd
# Patch to ignore seed to avoid synchronization across GPUs.
_orig_random_categorical = tf.random.categorical
def random_categorical(*args, **kwargs):
kwargs['seed'] = None
return _orig_random_categorical(*args, **kwargs)
tf.random.categorical = random_categorical
# Patch to ignore seed to avoid synchronization across GPUs.
_orig_random_normal = tf.random.normal
def random_normal(*args, **kwargs):
kwargs['seed'] = None
return _orig_random_normal(*args, **kwargs)
tf.random.normal = random_normal
class AttrDict(dict):
__setattr__ = dict.__setitem__
__getattr__ = dict.__getitem__
class Module(tf.Module):
def save(self, filename):
values = tf.nest.map_structure(lambda x: x.numpy(), self.variables)
amount = len(tf.nest.flatten(values))
count = int(sum(np.prod(x.shape) for x in tf.nest.flatten(values)))
print(f'Save checkpoint with {amount} tensors and {count} parameters.')
with pathlib.Path(filename).open('wb') as f:
pickle.dump(values, f)
def load(self, filename):
with pathlib.Path(filename).open('rb') as f:
values = pickle.load(f)
amount = len(tf.nest.flatten(values))
count = int(sum(np.prod(x.shape) for x in tf.nest.flatten(values)))
print(f'Load checkpoint with {amount} tensors and {count} parameters.')
tf.nest.map_structure(lambda x, y: x.assign(y), self.variables, values)
def get(self, name, ctor, *args, **kwargs):
# Create or get layer by name to avoid mentioning it in the constructor.
if not hasattr(self, '_modules'):
self._modules = {}
if name not in self._modules:
self._modules[name] = ctor(*args, **kwargs)
return self._modules[name]
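# E.g. self.get('h1', tfkl.Dense, 100) builds a Dense(100) layer on the first call and
# returns the same cached layer (keyed by the name 'h1') on every later call.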
def var_nest_names(nest):
if isinstance(nest, dict):
items = ' '.join(f'{k}:{var_nest_names(v)}' for k, v in nest.items())
return '{' + items + '}'
if isinstance(nest, (list, tuple)):
items = ' '.join(var_nest_names(v) for v in nest)
return '[' + items + ']'
if hasattr(nest, 'name') and hasattr(nest, 'shape'):
return nest.name + str(nest.shape).replace(', ', 'x')
if hasattr(nest, 'shape'):
return str(nest.shape).replace(', ', 'x')
return '?'
class Logger:
def __init__(self, logdir, step):
self._logdir = logdir
self._writer = tf.summary.create_file_writer(str(logdir), max_queue=1000)
self._last_step = None
self._last_time = None
self._scalars = {}
self._images = {}
self._videos = {}
self.step = step
def scalar(self, name, value):
self._scalars[name] = float(value)
def image(self, name, value):
self._images[name] = np.array(value)
def video(self, name, value):
self._videos[name] = np.array(value)
def write(self, fps=False):
scalars = list(self._scalars.items())
if fps:
scalars.append(('fps', self._compute_fps(self.step)))
print(f'[{self.step}]', ' / '.join(f'{k} {v:.1f}' for k, v in scalars))
with (self._logdir / 'metrics.jsonl').open('a') as f:
f.write(json.dumps({'step': self.step, ** dict(scalars)}) + '\n')
with self._writer.as_default():
for name, value in scalars:
tf.summary.scalar('scalars/' + name, value, self.step)
for name, value in self._images.items():
tf.summary.image(name, value, self.step)
for name, value in self._videos.items():
video_summary(name, value, self.step)
self._writer.flush()
self._scalars = {}
self._images = {}
self._videos = {}
def _compute_fps(self, step):
if self._last_step is None:
self._last_time = time.time()
self._last_step = step
return 0
steps = step - self._last_step
duration = time.time() - self._last_time
self._last_time += duration
self._last_step = step
return steps / duration
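# Minimal usage sketch (hypothetical logdir):
#   logger = Logger(pathlib.Path('~/logdir').expanduser(), step=0)
#   logger.scalar('train/loss', 1.23)
#   logger.write(fps=True)
# Scalars are appended to metrics.jsonl and written to TensorBoard; images and videos are
# buffered until write() and only go to TensorBoard.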
def graph_summary(writer, step, fn, *args):
def inner(*args):
tf.summary.experimental.set_step(step.numpy().item())
with writer.as_default():
fn(*args)
return tf.numpy_function(inner, args, [])
def video_summary(name, video, step=None, fps=20):
name = name if isinstance(name, str) else name.decode('utf-8')
if np.issubdtype(video.dtype, np.floating):
video = np.clip(255 * video, 0, 255).astype(np.uint8)
B, T, H, W, C = video.shape
try:
frames = video.transpose((1, 2, 0, 3, 4)).reshape((T, H, B * W, C))
summary = tf1.Summary()
image = tf1.Summary.Image(height=B * H, width=T * W, colorspace=C)
image.encoded_image_string = encode_gif(frames, fps)
summary.value.add(tag=name, image=image)
tf.summary.experimental.write_raw_pb(summary.SerializeToString(), step)
except (IOError, OSError) as e:
print('GIF summaries require ffmpeg in $PATH.', e)
frames = video.transpose((0, 2, 1, 3, 4)).reshape((1, B * H, T * W, C))
tf.summary.image(name, frames, step)
def encode_gif(frames, fps):
from subprocess import Popen, PIPE
h, w, c = frames[0].shape
pxfmt = {1: 'gray', 3: 'rgb24'}[c]
cmd = ' '.join([
f'ffmpeg -y -f rawvideo -vcodec rawvideo',
f'-r {fps:.02f} -s {w}x{h} -pix_fmt {pxfmt} -i - -filter_complex',
f'[0:v]split[x][z];[z]palettegen[y];[x]fifo[x];[x][y]paletteuse',
f'-r {fps:.02f} -f gif -'])
proc = Popen(cmd.split(' '), stdin=PIPE, stdout=PIPE, stderr=PIPE)
for image in frames:
proc.stdin.write(image.tostring())
out, err = proc.communicate()
if proc.returncode:
raise IOError('\n'.join([' '.join(cmd), err.decode('utf8')]))
del proc
return out
def simulate(agent, envs, steps=0, episodes=0, state=None):
# Initialize or unpack simulation state.
if state is None:
step, episode = 0, 0
done = np.ones(len(envs), np.bool)
length = np.zeros(len(envs), np.int32)
obs = [None] * len(envs)
agent_state = None
else:
step, episode, done, length, obs, agent_state = state
while (steps and step < steps) or (episodes and episode < episodes):
# Reset envs if necessary.
if done.any():
indices = [index for index, d in enumerate(done) if d]
results = [envs[i].reset() for i in indices]
for index, result in zip(indices, results):
obs[index] = result
# Step agents.
obs = {k: np.stack([o[k] for o in obs]) for k in obs[0]}
action, agent_state = agent(obs, done, agent_state)
if isinstance(action, dict):
action = [
{k: np.array(action[k][i]) for k in action}
for i in range(len(envs))]
else:
action = np.array(action)
assert len(action) == len(envs)
# Step envs.
results = [e.step(a) for e, a in zip(envs, action)]
obs, _, done = zip(*[p[:3] for p in results])
obs = list(obs)
done = np.stack(done)
episode += int(done.sum())
length += 1
step += (done * length).sum()
length *= (1 - done)
# import pdb
# pdb.set_trace()
# Return new state to allow resuming the simulation.
return (step - steps, episode - episodes, done, length, obs, agent_state)
def save_episodes(directory, episodes):
directory = pathlib.Path(directory).expanduser()
directory.mkdir(parents=True, exist_ok=True)
timestamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
filenames = []
for episode in episodes:
identifier = str(uuid.uuid4().hex)
length = len(episode['reward'])
filename = directory / f'{timestamp}-{identifier}-{length}.npz'
with io.BytesIO() as f1:
np.savez_compressed(f1, **episode)
f1.seek(0)
with filename.open('wb') as f2:
f2.write(f1.read())
filenames.append(filename)
return filenames
def sample_episodes(episodes, length=None, balance=False, seed=0):
random = np.random.RandomState(seed)
while True:
episode = random.choice(list(episodes.values()))
if length:
total = len(next(iter(episode.values())))
available = total - length
if available < 1:
# print(f'Skipped short episode of length {available}.')
continue
if balance:
index = min(random.randint(0, total), available)
else:
index = int(random.randint(0, available + 1))
episode = {k: v[index: index + length] for k, v in episode.items()}
yield episode
def load_episodes(directory, limit=None):
directory = pathlib.Path(directory).expanduser()
episodes = {}
total = 0
for filename in reversed(sorted(directory.glob('*.npz'))):
try:
with filename.open('rb') as f:
episode = np.load(f)
episode = {k: episode[k] for k in episode.keys()}
except Exception as e:
print(f'Could not load episode: {e}')
continue
episodes[str(filename)] = episode
total += len(episode['reward']) - 1
if limit and total >= limit:
break
return episodes
class DtypeDist:
def __init__(self, dist, dtype=None):
self._dist = dist
self._dtype = dtype or prec.global_policy().compute_dtype
@property
def name(self):
return 'DtypeDist'
def __getattr__(self, name):
return getattr(self._dist, name)
def mean(self):
return tf.cast(self._dist.mean(), self._dtype)
def mode(self):
return tf.cast(self._dist.mode(), self._dtype)
def entropy(self):
return tf.cast(self._dist.entropy(), self._dtype)
def sample(self, *args, **kwargs):
return tf.cast(self._dist.sample(*args, **kwargs), self._dtype)
class SampleDist:
def __init__(self, dist, samples=100):
self._dist = dist
self._samples = samples
@property
def name(self):
return 'SampleDist'
def __getattr__(self, name):
return getattr(self._dist, name)
def mean(self):
samples = self._dist.sample(self._samples)
return tf.reduce_mean(samples, 0)
def mode(self):
sample = self._dist.sample(self._samples)
logprob = self._dist.log_prob(sample)
return tf.gather(sample, tf.argmax(logprob))[0]
def entropy(self):
sample = self._dist.sample(self._samples)
logprob = self.log_prob(sample)
return -tf.reduce_mean(logprob, 0)
class OneHotDist(tfd.OneHotCategorical):
def __init__(self, logits=None, probs=None, dtype=None):
self._sample_dtype = dtype or prec.global_policy().compute_dtype
super().__init__(logits=logits, probs=probs)
def mode(self):
return tf.cast(super().mode(), self._sample_dtype)
def sample(self, sample_shape=(), seed=None):
# Straight through biased gradient estimator.
sample = tf.cast(super().sample(sample_shape, seed), self._sample_dtype)
probs = super().probs_parameter()
while len(probs.shape) < len(sample.shape):
probs = probs[None]
sample += tf.cast(probs - tf.stop_gradient(probs), self._sample_dtype)
return sample
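# The sample above is a straight-through estimator: the forward value stays a discrete
# one-hot vector, while adding probs - stop_gradient(probs) (which is zero-valued) routes
# gradients to the logits through the softmax probabilities instead.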
class GumbleDist(tfd.RelaxedOneHotCategorical):
def __init__(self, temp, logits=None, probs=None, dtype=None):
self._sample_dtype = dtype or prec.global_policy().compute_dtype
self._exact = tfd.OneHotCategorical(logits=logits, probs=probs)
super().__init__(temp, logits=logits, probs=probs)
def mode(self):
return tf.cast(self._exact.mode(), self._sample_dtype)
def entropy(self):
return tf.cast(self._exact.entropy(), self._sample_dtype)
def sample(self, sample_shape=(), seed=None):
return tf.cast(super().sample(sample_shape, seed), self._sample_dtype)
class UnnormalizedHuber(tfd.Normal):
def __init__(self, loc, scale, threshold=1, **kwargs):
self._threshold = tf.cast(threshold, loc.dtype)
super().__init__(loc, scale, **kwargs)
def log_prob(self, event):
return -(tf.math.sqrt(
(event - self.mean()) ** 2 + self._threshold ** 2) - self._threshold)
class SafeTruncatedNormal(tfd.TruncatedNormal):
def __init__(self, loc, scale, low, high, clip=1e-6, mult=1):
super().__init__(loc, scale, low, high)
self._clip = clip
self._mult = mult
def sample(self, *args, **kwargs):
event = super().sample(*args, **kwargs)
if self._clip:
clipped = tf.clip_by_value(
event, self.low + self._clip, self.high - self._clip)
event = event - tf.stop_gradient(event) + tf.stop_gradient(clipped)
if self._mult:
event *= self._mult
return event
class TanhBijector(tfp.bijectors.Bijector):
def __init__(self, validate_args=False, name='tanh'):
super().__init__(
forward_min_event_ndims=0,
validate_args=validate_args,
name=name)
def _forward(self, x):
return tf.nn.tanh(x)
def _inverse(self, y):
dtype = y.dtype
y = tf.cast(y, tf.float32)
y = tf.where(
tf.less_equal(tf.abs(y), 1.),
tf.clip_by_value(y, -0.99999997, 0.99999997), y)
y = tf.atanh(y)
y = tf.cast(y, dtype)
return y
def _forward_log_det_jacobian(self, x):
log2 = tf.math.log(tf.constant(2.0, dtype=x.dtype))
return 2.0 * (log2 - x - tf.nn.softplus(-2.0 * x))
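# Derivation of the log-det above: d tanh(x)/dx = 1 - tanh(x)^2 = 4 / (e^x + e^-x)^2, hence
# log|det J| = log 4 - 2x - 2 * log(1 + e^(-2x)) = 2 * (log 2 - x - softplus(-2x)).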
def lambda_return(
reward, value, pcont, bootstrap, lambda_, axis):
# Setting lambda=1 gives a discounted Monte Carlo return.
# Setting lambda=0 gives a fixed 1-step return.
assert reward.shape.ndims == value.shape.ndims, (reward.shape, value.shape)
if isinstance(pcont, (int, float)):
pcont = pcont * tf.ones_like(reward)
dims = list(range(reward.shape.ndims))
dims = [axis] + dims[1:axis] + [0] + dims[axis + 1:]
if axis != 0:
reward = tf.transpose(reward, dims)
value = tf.transpose(value, dims)
pcont = tf.transpose(pcont, dims)
if bootstrap is None:
bootstrap = tf.zeros_like(value[-1])
next_values = tf.concat([value[1:], bootstrap[None]], 0)
inputs = reward + pcont * next_values * (1 - lambda_)
returns = static_scan(
lambda agg, cur: cur[0] + cur[1] * lambda_ * agg,
(inputs, pcont), bootstrap, reverse=True)
if axis != 0:
returns = tf.transpose(returns, dims)
return returns
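# The backward scan above computes the TD(lambda) target
#   R_t = r_t + pcont_t * ((1 - lambda) * V_{t+1} + lambda * R_{t+1}),
# initialized at the bootstrap value; lambda around 0.95 is a typical Dreamer setting.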
class Optimizer(tf.Module):
def __init__(
self, name, lr, eps=1e-4, clip=None, wd=None, wd_pattern=r'.*',
opt='adam'):
assert 0 <= wd < 1
assert not clip or 1 <= clip
self._name = name
self._clip = clip
self._wd = wd
self._wd_pattern = wd_pattern
self._opt = {
'adam': lambda: tf.optimizers.Adam(lr, epsilon=eps),
'nadam': lambda: tf.optimizers.Nadam(lr, epsilon=eps),
'adamax': lambda: tf.optimizers.Adamax(lr, epsilon=eps),
'sgd': lambda: tf.optimizers.SGD(lr),
'momentum': lambda: tf.optimizers.SGD(lr, 0.9),
}[opt]()
self._mixed = (prec.global_policy().compute_dtype == tf.float16)
if self._mixed:
self._opt = prec.LossScaleOptimizer(self._opt, 'dynamic')
@property
def variables(self):
return self._opt.variables()
def __call__(self, tape, loss, modules, prefix=None):
assert loss.dtype is tf.float32, self._name
modules = modules if hasattr(modules, '__len__') else (modules,)
varibs = tf.nest.flatten([module.variables for module in modules])
count = sum(np.prod(x.shape) for x in varibs)
print(f'Found {count} {self._name} parameters.')
assert len(loss.shape) == 0, loss.shape
tf.debugging.check_numerics(loss, self._name + '_loss')
if self._mixed:
with tape:
loss = self._opt.get_scaled_loss(loss)
grads = tape.gradient(loss, varibs)
if self._mixed:
grads = self._opt.get_unscaled_gradients(grads)
norm = tf.linalg.global_norm(grads)
if not self._mixed:
tf.debugging.check_numerics(norm, self._name + '_norm')
if self._clip:
grads, _ = tf.clip_by_global_norm(grads, self._clip, norm)
if self._wd:
self._apply_weight_decay(varibs)
self._opt.apply_gradients(zip(grads, varibs))
metrics = {}
if prefix:
metrics[f'{prefix}/{self._name}_loss'] = loss
metrics[f'{prefix}/{self._name}_grad_norm'] = norm
if self._mixed:
metrics[f'{prefix}/{self._name}_loss_scale'] = \
self._opt.loss_scale._current_loss_scale
else:
metrics[f'{self._name}_loss'] = loss
metrics[f'{self._name}_grad_norm'] = norm
if self._mixed:
metrics[f'{self._name}_loss_scale'] = \
self._opt.loss_scale._current_loss_scale
return metrics
def _apply_weight_decay(self, varibs):
nontrivial = (self._wd_pattern != r'.*')
if nontrivial:
print('Applied weight decay to variables:')
for var in varibs:
if re.search(self._wd_pattern, self._name + '/' + var.name):
if nontrivial:
print('- ' + self._name + '/' + var.name)
var.assign((1 - self._wd) * var)
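# Rough usage sketch (hypothetical loss and modules):
#   opt = Optimizer('model', lr=3e-4, clip=100, wd=1e-6)
#   with tf.GradientTape() as tape:
#     loss = compute_model_loss(...)   # must be a float32 scalar
#   metrics = opt(tape, loss, [encoder, dynamics, decoder])
# Under a float16 policy the loss is scaled inside the tape and the gradients are unscaled
# again before norm clipping and weight decay.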
def args_type(default):
def parse_string(x):
if default is None:
return x
if isinstance(default, bool):
return bool(['False', 'True'].index(x))
if isinstance(default, int):
return float(x) if ('e' in x or '.' in x) else int(x)
if isinstance(default, (list, tuple)):
return tuple(args_type(default[0])(y) for y in x.split(','))
return type(default)(x)
def parse_object(x):
if isinstance(default, (list, tuple)):
return tuple(x)
return x
return lambda x: parse_string(x) if isinstance(x, str) else parse_object(x)
def static_scan(fn, inputs, start, reverse=False):
last = start
outputs = [[] for _ in tf.nest.flatten(start)]
indices = range(len(tf.nest.flatten(inputs)[0]))
if reverse:
indices = reversed(indices)
for index in indices:
inp = tf.nest.map_structure(lambda x: x[index], inputs)
last = fn(last, inp)
[o.append(l) for o, l in zip(outputs, tf.nest.flatten(last))]
if reverse:
outputs = [list(reversed(x)) for x in outputs]
outputs = [tf.stack(x, 0) for x in outputs]
return tf.nest.pack_sequence_as(start, outputs)
def uniform_mixture(dist, dtype=None):
if dist.batch_shape[-1] == 1:
return tfd.BatchReshape(dist, dist.batch_shape[:-1])
dtype = dtype or prec.global_policy().compute_dtype
weights = tfd.Categorical(tf.zeros(dist.batch_shape, dtype))
return tfd.MixtureSameFamily(weights, dist)
def cat_mixture_entropy(dist):
if isinstance(dist, tfd.MixtureSameFamily):
probs = dist.components_distribution.probs_parameter()
else:
probs = dist.probs_parameter()
return -tf.reduce_mean(
tf.reduce_mean(probs, 2) *
tf.math.log(tf.reduce_mean(probs, 2) + 1e-8), -1)
@tf.function
def cem_planner(
state, num_actions, horizon, proposals, topk, iterations, imagine,
objective):
dtype = prec.global_policy().compute_dtype
B, P = list(state.values())[0].shape[0], proposals
H, A = horizon, num_actions
flat_state = {k: tf.repeat(v, P, 0) for k, v in state.items()}
mean = tf.zeros((B, H, A), dtype)
std = tf.ones((B, H, A), dtype)
for _ in range(iterations):
proposals = tf.random.normal((B, P, H, A), dtype=dtype)
proposals = proposals * std[:, None] + mean[:, None]
proposals = tf.clip_by_value(proposals, -1, 1)
flat_proposals = tf.reshape(proposals, (B * P, H, A))
states = imagine(flat_proposals, flat_state)
scores = objective(states)
scores = tf.reshape(tf.reduce_sum(scores, -1), (B, P))
_, indices = tf.math.top_k(scores, topk, sorted=False)
best = tf.gather(proposals, indices, axis=1, batch_dims=1)
mean, var = tf.nn.moments(best, 1)
std = tf.sqrt(var + 1e-6)
return mean[:, 0, :]
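# Sketch of the CEM loop above: sample `proposals` action sequences from N(mean, std), clip
# them to [-1, 1], score each by the summed imagined objective, refit mean/std to the top-k
# sequences, repeat for `iterations` rounds, and finally return only the first action of the
# refined mean plan.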
@tf.function
def grad_planner(
state, num_actions, horizon, proposals, iterations, imagine, objective,
kl_scale, step_size):
dtype = prec.global_policy().compute_dtype
B, P = list(state.values())[0].shape[0], proposals
H, A = horizon, num_actions
flat_state = {k: tf.repeat(v, P, 0) for k, v in state.items()}
mean = tf.zeros((B, H, A), dtype)
rawstd = 0.54 * tf.ones((B, H, A), dtype)
for _ in range(iterations):
proposals = tf.random.normal((B, P, H, A), dtype=dtype)
with tf.GradientTape(watch_accessed_variables=False) as tape:
tape.watch(mean)
tape.watch(rawstd)
std = tf.nn.softplus(rawstd)
proposals = proposals * std[:, None] + mean[:, None]
proposals = (
tf.stop_gradient(tf.clip_by_value(proposals, -1, 1)) +
proposals - tf.stop_gradient(proposals))
flat_proposals = tf.reshape(proposals, (B * P, H, A))
states = imagine(flat_proposals, flat_state)
scores = objective(states)
scores = tf.reshape(tf.reduce_sum(scores, -1), (B, P))
div = tfd.kl_divergence(
tfd.Normal(mean, std),
tfd.Normal(tf.zeros_like(mean), tf.ones_like(std)))
elbo = tf.reduce_sum(scores) - kl_scale * div
elbo /= tf.cast(tf.reduce_prod(tf.shape(scores)), dtype)
grad_mean, grad_rawstd = tape.gradient(elbo, [mean, rawstd])
e, v = tf.nn.moments(grad_mean, [1, 2], keepdims=True)
grad_mean /= tf.sqrt(e * e + v + 1e-4)
e, v = tf.nn.moments(grad_rawstd, [1, 2], keepdims=True)
grad_rawstd /= tf.sqrt(e * e + v + 1e-4)
mean = tf.clip_by_value(mean + step_size * grad_mean, -1, 1)
rawstd = rawstd + step_size * grad_rawstd
return mean[:, 0, :]
class Every:
def __init__(self, every):
self._every = every
self._last = None
def __call__(self, step):
if not self._every:
return False
if self._last is None:
self._last = step
return True
if step >= self._last + self._every:
self._last += self._every
return True
return False
class Once:
def __init__(self):
self._once = True
def __call__(self):
if self._once:
self._once = False
return True
return False
class Until:
def __init__(self, until):
self._until = until
def __call__(self, step):
if not self._until:
return True
return step < self._until
def schedule(string, step):
try:
return float(string)
except ValueError:
step = tf.cast(step, tf.float32)
match = re.match(r'linear\((.+),(.+),(.+)\)', string)
if match:
initial, final, duration = [float(group) for group in match.groups()]
mix = tf.clip_by_value(step / duration, 0, 1)
return (1 - mix) * initial + mix * final
match = re.match(r'warmup\((.+),(.+)\)', string)
if match:
warmup, value = [float(group) for group in match.groups()]
scale = tf.clip_by_value(step / warmup, 0, 1)
return scale * value
match = re.match(r'exp\((.+),(.+),(.+)\)', string)
if match:
initial, final, halflife = [float(group) for group in match.groups()]
return (initial - final) * 0.5 ** (step / halflife) + final
raise NotImplementedError(string)
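# Example schedule strings accepted above (hypothetical values):
#   '0.3'                  constant 0.3
#   'linear(1.0,0.1,1e6)'  anneal 1.0 -> 0.1 linearly over the first 1e6 steps
#   'warmup(1e4,1.0)'      ramp from 0 to 1.0 over 1e4 steps, then stay at 1.0
#   'exp(1.0,0.1,5e5)'     decay from 1.0 toward 0.1 with a half-life of 5e5 steps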

280
DreamerV2/wrappers.py Normal file
View File

@ -0,0 +1,280 @@
import threading
import gym
import numpy as np
class DeepMindControl:
def __init__(self, name, action_repeat=1, size=(64, 64), camera=None):
domain, task = name.split('_', 1)
if domain == 'cup': # Only domain with multiple words.
domain = 'ball_in_cup'
if isinstance(domain, str):
from dm_control import suite
self._env = suite.load(domain, task)
else:
assert task is None
self._env = domain()
self._action_repeat = action_repeat
self._size = size
if camera is None:
camera = dict(quadruped=2).get(domain, 0)
self._camera = camera
@property
def observation_space(self):
spaces = {}
for key, value in self._env.observation_spec().items():
spaces[key] = gym.spaces.Box(
-np.inf, np.inf, value.shape, dtype=np.float32)
spaces['image'] = gym.spaces.Box(
0, 255, self._size + (3,), dtype=np.uint8)
return gym.spaces.Dict(spaces)
@property
def action_space(self):
spec = self._env.action_spec()
return gym.spaces.Box(spec.minimum, spec.maximum, dtype=np.float32)
def step(self, action):
assert np.isfinite(action).all(), action
reward = 0
for _ in range(self._action_repeat):
time_step = self._env.step(action)
reward += time_step.reward or 0
if time_step.last():
break
obs = dict(time_step.observation)
obs['image'] = self.render()
done = time_step.last()
info = {'discount': np.array(time_step.discount, np.float32)}
return obs, reward, done, info
def reset(self):
time_step = self._env.reset()
obs = dict(time_step.observation)
obs['image'] = self.render()
return obs
def render(self, *args, **kwargs):
if kwargs.get('mode', 'rgb_array') != 'rgb_array':
raise ValueError("Only render mode 'rgb_array' is supported.")
return self._env.physics.render(*self._size, camera_id=self._camera)
class Atari:
LOCK = threading.Lock()
def __init__(
self, name, action_repeat=4, size=(84, 84), grayscale=True, noops=30,
life_done=False, sticky_actions=True, all_actions=False):
assert size[0] == size[1]
import gym.wrappers
import gym.envs.atari
with self.LOCK:
env = gym.envs.atari.AtariEnv(
game=name, obs_type='image', frameskip=1,
repeat_action_probability=0.25 if sticky_actions else 0.0,
full_action_space=all_actions)
# Avoid unnecessary rendering in inner env.
env._get_obs = lambda: None
# Tell wrapper that the inner env has no action repeat.
env.spec = gym.envs.registration.EnvSpec('NoFrameskip-v0')
env = gym.wrappers.AtariPreprocessing(
env, noops, action_repeat, size[0], life_done, grayscale)
self._env = env
self._grayscale = grayscale
@property
def observation_space(self):
return gym.spaces.Dict({
'image': self._env.observation_space,
'ram': gym.spaces.Box(0, 255, (128,), np.uint8),
})
@property
def action_space(self):
return self._env.action_space
def close(self):
return self._env.close()
def reset(self):
with self.LOCK:
image = self._env.reset()
if self._grayscale:
image = image[..., None]
obs = {'image': image, 'ram': self._env.env._get_ram()}
return obs
def step(self, action):
image, reward, done, info = self._env.step(action)
if self._grayscale:
image = image[..., None]
obs = {'image': image, 'ram': self._env.env._get_ram()}
return obs, reward, done, info
def render(self, mode):
return self._env.render(mode)
class CollectDataset:
def __init__(self, env, callbacks=None, precision=32):
self._env = env
self._callbacks = callbacks or ()
self._precision = precision
self._episode = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
obs, reward, done, info = self._env.step(action)
obs = {k: self._convert(v) for k, v in obs.items()}
transition = obs.copy()
transition['action'] = action
transition['reward'] = reward
transition['discount'] = info.get('discount', np.array(1 - float(done)))
self._episode.append(transition)
if done:
episode = {k: [t[k] for t in self._episode] for k in self._episode[0]}
episode = {k: self._convert(v) for k, v in episode.items()}
info['episode'] = episode
for callback in self._callbacks:
callback(episode)
return obs, reward, done, info
def reset(self):
obs = self._env.reset()
transition = obs.copy()
transition['action'] = np.zeros(self._env.action_space.shape)
transition['reward'] = 0.0
transition['discount'] = 1.0
self._episode = [transition]
return obs
def _convert(self, value):
value = np.array(value)
if np.issubdtype(value.dtype, np.floating):
dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self._precision]
elif np.issubdtype(value.dtype, np.signedinteger):
dtype = {16: np.int16, 32: np.int32, 64: np.int64}[self._precision]
elif np.issubdtype(value.dtype, np.uint8):
dtype = np.uint8
else:
raise NotImplementedError(value.dtype)
return value.astype(dtype)
class TimeLimit:
def __init__(self, env, duration):
self._env = env
self._duration = duration
self._step = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
assert self._step is not None, 'Must reset environment.'
obs, reward, done, info = self._env.step(action)
self._step += 1
if self._step >= self._duration:
done = True
if 'discount' not in info:
info['discount'] = np.array(1.0).astype(np.float32)
self._step = None
return obs, reward, done, info
def reset(self):
self._step = 0
return self._env.reset()
class NormalizeActions:
def __init__(self, env):
self._env = env
self._mask = np.logical_and(
np.isfinite(env.action_space.low),
np.isfinite(env.action_space.high))
self._low = np.where(self._mask, env.action_space.low, -1)
self._high = np.where(self._mask, env.action_space.high, 1)
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
low = np.where(self._mask, -np.ones_like(self._low), self._low)
high = np.where(self._mask, np.ones_like(self._low), self._high)
return gym.spaces.Box(low, high, dtype=np.float32)
def step(self, action):
original = (action + 1) / 2 * (self._high - self._low) + self._low
original = np.where(self._mask, original, action)
return self._env.step(original)
class OneHotAction:
def __init__(self, env):
assert isinstance(env.action_space, gym.spaces.Discrete)
self._env = env
self._random = np.random.RandomState()
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
shape = (self._env.action_space.n,)
space = gym.spaces.Box(low=0, high=1, shape=shape, dtype=np.float32)
space.sample = self._sample_action
return space
def step(self, action):
index = np.argmax(action).astype(int)
reference = np.zeros_like(action)
reference[index] = 1
if not np.allclose(reference, action):
raise ValueError(f'Invalid one-hot action:\n{action}')
return self._env.step(index)
def reset(self):
return self._env.reset()
def _sample_action(self):
actions = self._env.action_space.n
index = self._random.randint(0, actions)
reference = np.zeros(actions, dtype=np.float32)
reference[index] = 1.0
return reference
class RewardObs:
def __init__(self, env):
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
spaces = self._env.observation_space.spaces
assert 'reward' not in spaces
spaces['reward'] = gym.spaces.Box(-np.inf, np.inf, dtype=np.float32)
return gym.spaces.Dict(spaces)
def step(self, action):
obs, reward, done, info = self._env.step(action)
obs['reward'] = reward
return obs, reward, done, info
def reset(self):
obs = self._env.reset()
obs['reward'] = 0.0
return obs

126
README.md Normal file
View File

@ -0,0 +1,126 @@
# Learning Task Informed Abstractions (TIA)
<sub>Left to right: Raw Observation, Dreamer, Joint of TIA, Task Stream of TIA, Distractor Stream of TIA</sub>
![](imgs/gt.gif) ![](imgs/pred.gif) ![](imgs/joint.gif) ![](imgs/main.gif) ![](imgs/disen.gif)
This code base contains minimal modifications of [Dreamer](https://danijar.com/project/dreamer/)/[DreamerV2](https://danijar.com/project/dreamerv2/) to learn disentangled world models, as presented in:
**Learning Task Informed Abstractions**
Xiang Fu*, Ge Yang*, Pulkit Agrawal, Tommi Jaakkola
ICML 2021 [[website]](https://xiangfu.co/tia) [[paper]](https://arxiv.org/abs/2106.15612)
The directory [Dreamer](./Dreamer) contains code for running DMC experiments. The directory [DreamerV2](./DreamerV2) contains code for running Atari experiments. This implementation is tested with Python 3.6, TensorFlow 2.3.1, and CUDA 10.1. The training/evaluation metrics used for producing the figures in the paper can be downloaded from [this Google Drive link](https://drive.google.com/file/d/1wvSp9Q7r2Ah5xRE_x3nJy-uwLkjF2RgX/view?usp=sharing).
If you find this code useful, please consider citing:
```
@inproceedings{fu2021learning,
title={Learning Task Informed Abstractions},
author={Fu, Xiang and Yang, Ge and Agrawal, Pulkit and Jaakkola, Tommi},
booktitle={ICML},
year={2021}
}
```
## Getting started
Get dependencies:
```sh
pip3 install --user tensorflow-gpu==2.3.1
pip3 install --user tensorflow_probability==0.11.0
pip3 install --user gym
pip3 install --user pandas
pip3 install --user matplotlib
pip3 install --user ruamel.yaml
pip3 install --user scikit-image
pip3 install --user git+git://github.com/deepmind/dm_control.git
pip3 install --user 'gym[atari]'
```
You will need an active MuJoCo license for running DMC experiments.
## Running DMC experiments with distracting background
Code for running DMC experiments is under the directory [Dreamer](./Dreamer).
To run DMC experiments with distracting video backgrounds, you can download a small set of 16 videos (those whose names start with "A" in the Kinetics 400 dataset's `driving_car` class) from [this Google Drive link](https://drive.google.com/file/d/1f-ER2XnhpvQeGjlJaoGRiLR0oEjn6Le_/view?usp=sharing), which was used to produce Figure 9(a) in the paper's appendix.
To replicate the setup of [DBC](https://github.com/facebookresearch/deep_bisim4control) and use more background videos, first download the Kinetics 400 dataset and grab the videos labeled `driving_car` from the training split. You can use the repo
[https://github.com/Showmax/kinetics-downloader](https://github.com/Showmax/kinetics-downloader)
to download the dataset.
Train the agent:
```sh
python run.py --method dreamer --configs dmc --task dmc_cheetah_run_driving --logdir ~/logdir --video_dir VIDPATH
```
`VIDPATH` should contain `*.mp4` video files. (If you used the above repo to download the Kinetics videos, set `VIDPATH` to `PATH_TO_REPO/kinetics-downloader/dataset/train/driving_car`.)
Choose method from:
```
[dreamer, tia, inverse]
```
corresponding to the original Dreamer, TIA, and representation learned with an inverse model as described in Section 4.2 of the paper.
Choose environment + distraction (e.g. `dmc_cheetah_run_driving`):
```
dmc_{domain}_{task}_{distraction}
```
where {domain} (e.g., cheetah, walker, hopper) and {task} (e.g., run, walk, stand) are from the DeepMind Control Suite, and {distraction} can be chosen from the options below; a full example command follows the list:
```
[none, noise, driving]
```
where each option uses different backgrounds:
```
none: default (no) background
noise: white noise background
driving: natural videos from the driving_car class as background
```
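For example, assuming the background videos are stored under `VIDPATH`, a hypothetical TIA run on Walker Walk with driving-video distraction would be:
```sh
python run.py --method tia --configs dmc --task dmc_walker_walk_driving --logdir ~/logdir --video_dir VIDPATH
```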
## Running Atari experiments
Code for running Atari experiments is under the directory [DreamerV2](./DreamerV2).
Train the agent with the game Demon Attack:
```sh
python dreamer.py --logdir ~/logdir/atari_demon_attack/TIA/1 \
--configs defaults atari --task atari_demon_attack
```
## Monitoring results
Both DMC and Atari experiments log with TensorBoard by default. The decomposition of the two streams of TIA is visualized as `.gif` animations. Access TensorBoard with the command:
```sh
tensorboard --logdir LOGDIR
```
## Reference
We modify [Dreamer](https://github.com/danijar/dreamer) for DMC environments and [DreamerV2](https://github.com/danijar/dreamerv2) for Atari games. Thanks to Danijar for releasing his very clean implementations! Utilities such as
- Logging with TensorBoard/JSON line files
- Debugging with the `debug` flag
- Mixed-precision training
are the same as in the respective original implementations.

BIN  imgs/disen.gif  Normal file  (binary file not shown; 872 KiB)
BIN  imgs/gt.gif  Normal file  (binary file not shown; 931 KiB)
BIN  imgs/joint.gif  Normal file  (binary file not shown; 925 KiB)
BIN  imgs/main.gif  Normal file  (binary file not shown; 899 KiB)
BIN  imgs/pred.gif  Normal file  (binary file not shown; 970 KiB)