tia/Dreamer/dreamers.py

import tools
import models
from tensorflow_probability import distributions as tfd
from tensorflow.keras.mixed_precision import experimental as prec
import tensorflow as tf
import numpy as np
import collections
import functools
import json
import time

from env_tools import preprocess, count_steps

def load_dataset(directory, config):
    episode = next(tools.load_episodes(directory, 1))
    types = {k: v.dtype for k, v in episode.items()}
    shapes = {k: (None,) + v.shape[1:] for k, v in episode.items()}

    def generator(): return tools.load_episodes(
        directory, config.train_steps, config.batch_length,
        config.dataset_balance)
    dataset = tf.data.Dataset.from_generator(generator, types, shapes)
    dataset = dataset.batch(config.batch_size, drop_remainder=True)
    dataset = dataset.map(functools.partial(preprocess, config=config))
    dataset = dataset.prefetch(10)
    return dataset

class Dreamer(tools.Module):

    def __init__(self, config, datadir, actspace, writer):
        self._c = config
        self._actspace = actspace
        self._actdim = actspace.n if hasattr(
            actspace, 'n') else actspace.shape[0]
        self._writer = writer
        self._random = np.random.RandomState(config.seed)
        self._should_pretrain = tools.Once()
        self._should_train = tools.Every(config.train_every)
        self._should_log = tools.Every(config.log_every)
        self._last_log = None
        self._last_time = time.time()
        self._metrics = collections.defaultdict(tf.metrics.Mean)
        self._metrics['expl_amount']  # Create variable for checkpoint.
        self._float = prec.global_policy().compute_dtype
        self._strategy = tf.distribute.MirroredStrategy()
        with tf.device('cpu:0'):
            self._step = tf.Variable(count_steps(
                datadir, config), dtype=tf.int64)
        with self._strategy.scope():
            self._dataset = iter(self._strategy.experimental_distribute_dataset(
                load_dataset(datadir, self._c)))
        self._build_model()

    def __call__(self, obs, reset, state=None, training=True):
        step = self._step.numpy().item()
        tf.summary.experimental.set_step(step)
        if state is not None and reset.any():
            mask = tf.cast(1 - reset, self._float)[:, None]
            state = tf.nest.map_structure(lambda x: x * mask, state)
        if self._should_train(step):
            log = self._should_log(step)
            n = self._c.pretrain if self._should_pretrain() else self._c.train_steps
            print(f'Training for {n} steps.')
            with self._strategy.scope():
                for train_step in range(n):
                    log_images = self._c.log_images and log and train_step == 0
                    self.train(next(self._dataset), log_images)
            if log:
                self._write_summaries()
        action, state = self.policy(obs, state, training)
        if training:
            self._step.assign_add(len(reset) * self._c.action_repeat)
        return action, state

    @tf.function
    def policy(self, obs, state, training):
        if state is None:
            latent = self._dynamics.initial(len(obs['image']))
            action = tf.zeros((len(obs['image']), self._actdim), self._float)
        else:
            latent, action = state
        embed = self._encode(preprocess(obs, self._c))
        latent, _ = self._dynamics.obs_step(latent, action, embed)
        feat = self._dynamics.get_feat(latent)
        if training:
            action = self._actor(feat).sample()
        else:
            action = self._actor(feat).mode()
        action = self._exploration(action, training)
        state = (latent, action)
        return action, state

    def load(self, filename):
        super().load(filename)
        self._should_pretrain()

    @tf.function()
    def train(self, data, log_images=False):
        self._strategy.experimental_run_v2(
            self._train, args=(data, log_images))

    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)

            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)

            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data['image']))
            likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale

            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)

            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as actor_tape:
            imag_feat = self._imagine_ahead(post)
            reward = self._reward(imag_feat).mode()
            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)
            value = self._value(imag_feat).mode()
            returns = tools.lambda_return(
                reward[:-1], value[:-1], pcont[:-1],
                bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
            discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
                [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
            actor_loss = -tf.reduce_mean(discount * returns)
            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = - \
                tf.reduce_mean(discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(
                    data, feat, prior_dist, post_dist, likes, div,
                    model_loss, value_loss, actor_loss, model_norm, value_norm,
                    actor_norm)
            if tf.equal(log_images, True):
                self._image_summaries(data, embed, image_pred)

    def _build_model(self):
        acts = dict(
            elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish,
            leaky_relu=tf.nn.leaky_relu)
        cnn_act = acts[self._c.cnn_act]
        act = acts[self._c.dense_act]
        self._encode = models.ConvEncoder(
            self._c.cnn_depth, cnn_act, self._c.image_size)
        self._dynamics = models.RSSM(
            self._c.stoch_size, self._c.deter_size, self._c.deter_size)
        self._decode = models.ConvDecoder(
            self._c.cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
        self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
        if self._c.pcont:
            self._pcont = models.DenseDecoder(
                (), 3, self._c.num_units, 'binary', act=act)
        self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
        self._actor = models.ActionDecoder(
            self._actdim, 4, self._c.num_units, self._c.action_dist,
            init_std=self._c.action_init_std, act=act)
        model_modules = [self._encode, self._dynamics,
                         self._decode, self._reward]
        if self._c.pcont:
            model_modules.append(self._pcont)
        Optimizer = functools.partial(
            tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip,
            wdpattern=self._c.weight_decay_pattern)
        self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
        self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
        self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
        self.train(next(self._dataset))

    def _exploration(self, action, training):
        if training:
            amount = self._c.expl_amount
            if self._c.expl_decay:
                amount *= 0.5 ** (tf.cast(self._step,
                                          tf.float32) / self._c.expl_decay)
            if self._c.expl_min:
                amount = tf.maximum(self._c.expl_min, amount)
            self._metrics['expl_amount'].update_state(amount)
        elif self._c.eval_noise:
            amount = self._c.eval_noise
        else:
            return action
        if self._c.expl == 'additive_gaussian':
            return tf.clip_by_value(tfd.Normal(action, amount).sample(), -1, 1)
        if self._c.expl == 'completely_random':
            return tf.random.uniform(action.shape, -1, 1)
        if self._c.expl == 'epsilon_greedy':
            indices = tfd.Categorical(0 * action).sample()
            # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
            return tf.where(
                tf.random.uniform(action.shape[:1], 0, 1) < amount,
                tf.one_hot(indices, action.shape[-1], dtype=self._float),
                action)
        raise NotImplementedError(self._c.expl)

    def _imagine_ahead(self, post):
        if self._c.pcont:  # Last step could be terminal.
            post = {k: v[:, :-1] for k, v in post.items()}

        def flatten(x): return tf.reshape(x, [-1] + list(x.shape[2:]))
        start = {k: flatten(v) for k, v in post.items()}

        def policy(state): return self._actor(
            tf.stop_gradient(self._dynamics.get_feat(state))).sample()
        states = tools.static_scan(
            lambda prev, _: self._dynamics.img_step(prev, policy(prev)),
            tf.range(self._c.horizon), start)
        imag_feat = self._dynamics.get_feat(states)
        return imag_feat

    def _scalar_summaries(
            self, data, feat, prior_dist, post_dist, likes, div,
            model_loss, value_loss, actor_loss, model_norm, value_norm,
            actor_norm):
        self._metrics['model_grad_norm'].update_state(model_norm)
        self._metrics['value_grad_norm'].update_state(value_norm)
        self._metrics['actor_grad_norm'].update_state(actor_norm)
        self._metrics['prior_ent'].update_state(prior_dist.entropy())
        self._metrics['post_ent'].update_state(post_dist.entropy())
        for name, logprob in likes.items():
            self._metrics[name + '_loss'].update_state(-logprob)
        self._metrics['div'].update_state(div)
        self._metrics['model_loss'].update_state(model_loss)
        self._metrics['value_loss'].update_state(value_loss)
        self._metrics['actor_loss'].update_state(actor_loss)
        self._metrics['action_ent'].update_state(self._actor(feat).entropy())

    def _image_summaries(self, data, embed, image_pred):
        truth = data['image'][:6] + 0.5
        recon = image_pred.mode()[:6]
        init, _ = self._dynamics.observe(embed[:6, :5], data['action'][:6, :5])
        init = {k: v[:, -1] for k, v in init.items()}
        prior = self._dynamics.imagine(data['action'][:6, 5:], init)
        openl = self._decode(self._dynamics.get_feat(prior)).mode()
        model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1)
        error = (model - truth + 1) / 2
        openl = tf.concat([truth, model, error], 2)
        tools.graph_summary(
            self._writer, tools.video_summary, self._step, 'agent/openl', openl)

    def image_summary_from_data(self, data):
        truth = data['image'][:6] + 0.5
        embed = self._encode(data)
        post, _ = self._dynamics.observe(
            embed[:6, :5], data['action'][:6, :5])
        feat = self._dynamics.get_feat(post)
        init = {k: v[:, -1] for k, v in post.items()}
        recon = self._decode(feat).mode()[:6]
        prior = self._dynamics.imagine(data['action'][:6, 5:], init)
        openl = self._decode(self._dynamics.get_feat(prior)).mode()
        model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1)
        error = (model - truth + 1) / 2
        openl = tf.concat([truth, model, error], 2)
        tools.graph_summary(
            self._writer, tools.video_summary, self._step, 'agent/eval_openl', openl)

    def _write_summaries(self):
        step = int(self._step.numpy())
        metrics = [(k, float(v.result())) for k, v in self._metrics.items()]
        if self._last_log is not None:
            duration = time.time() - self._last_time
            self._last_time += duration
            metrics.append(('fps', (step - self._last_log) / duration))
        self._last_log = step
        [m.reset_states() for m in self._metrics.values()]
        with (self._c.logdir / 'metrics.jsonl').open('a') as f:
            f.write(json.dumps({'step': step, **dict(metrics)}) + '\n')
        [tf.summary.scalar('agent/' + k, m) for k, m in metrics]
        print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics))
        self._writer.flush()


class SeparationDreamer(Dreamer):

    def __init__(self, config, datadir, actspace, writer):
        self._metrics_disen = collections.defaultdict(tf.metrics.Mean)
        self._metrics_disen['expl_amount']
        super().__init__(config, datadir, actspace, writer)

    def _train(self, data, log_images):
        with tf.GradientTape(persistent=True) as model_tape:

            # main
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)

            # disen
            embed_disen = self._disen_encode(data)
            post_disen, prior_disen = self._disen_dynamics.observe(
                embed_disen, data['action'])
            feat_disen = self._disen_dynamics.get_feat(post_disen)

            # disen image pred
            image_pred_disen = self._disen_only_decode(feat_disen)

            # joint image pred
            image_pred_joint, image_pred_joint_main, image_pred_joint_disen, mask_pred = self._joint_decode(
                feat, feat_disen)

            # reward pred
            reward_pred = self._reward(feat)

            # optimize disen reward predictor till optimal
            for _ in range(self._c.num_reward_opt_iters):
                with tf.GradientTape() as disen_reward_tape:
                    reward_pred_disen = self._disen_reward(
                        tf.stop_gradient(feat_disen))
                    reward_like_disen = reward_pred_disen.log_prob(
                        data['reward'])
                    reward_loss_disen = -tf.reduce_mean(reward_like_disen)
                    reward_loss_disen /= float(
                        self._strategy.num_replicas_in_sync)
                reward_disen_norm = self._disen_reward_opt(
                    disen_reward_tape, reward_loss_disen)

            # disen reward pred with optimal reward predictor
            reward_pred_disen = self._disen_reward(feat_disen)
            reward_like_disen = tf.reduce_mean(
                reward_pred_disen.log_prob(data['reward']))

            # main model loss
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(
                image_pred_joint.log_prob(data['image']))
            likes.reward = tf.reduce_mean(reward_pred.log_prob(
                data['reward'])) * self._c.reward_scale
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale

            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)

            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

            # disen model loss with reward negative gradient
            likes_disen = tools.AttrDict()
            likes_disen.image = tf.reduce_mean(
                image_pred_joint.log_prob(data['image']))
            likes_disen.disen_only = tf.reduce_mean(
                image_pred_disen.log_prob(data['image']))

            reward_like_disen = reward_pred_disen.log_prob(data['reward'])
            reward_like_disen = tf.reduce_mean(reward_like_disen)
            reward_loss_disen = -reward_like_disen

            prior_dist_disen = self._disen_dynamics.get_dist(prior_disen)
            post_dist_disen = self._disen_dynamics.get_dist(post_disen)
            div_disen = tf.reduce_mean(tfd.kl_divergence(
                post_dist_disen, prior_dist_disen))
            div_disen = tf.maximum(div_disen, self._c.free_nats)

            model_loss_disen = div_disen * self._c.disen_kl_scale + \
                reward_like_disen * self._c.disen_neg_rew_scale - \
                likes_disen.image - likes_disen.disen_only * self._c.disen_rec_scale
            model_loss_disen /= float(self._strategy.num_replicas_in_sync)

            decode_loss = model_loss_disen + model_loss

        with tf.GradientTape() as actor_tape:
            imag_feat = self._imagine_ahead(post)
            reward = self._reward(imag_feat).mode()
            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)
            value = self._value(imag_feat).mode()
            returns = tools.lambda_return(
                reward[:-1], value[:-1], pcont[:-1],
                bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
            discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
                [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
            actor_loss = -tf.reduce_mean(discount * returns)
            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = - \
                tf.reduce_mean(discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        model_disen_norm = self._disen_opt(model_tape, model_loss_disen)
        decode_norm = self._decode_opt(model_tape, decode_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(
                    data, feat, prior_dist, post_dist, likes, div,
                    model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm)
                self._scalar_summaries_disen(
                    prior_dist_disen, post_dist_disen, likes_disen, div_disen,
                    model_loss_disen, reward_loss_disen,
                    model_disen_norm, reward_disen_norm)
            if tf.equal(log_images, True):
                self._image_summaries_joint(
                    data, embed, embed_disen, image_pred_joint, mask_pred)
                self._image_summaries(
                    self._disen_dynamics, self._disen_decode, data, embed_disen, image_pred_joint_disen, tag='disen/openl_joint_disen')
                self._image_summaries(
                    self._disen_dynamics, self._disen_only_decode, data, embed_disen, image_pred_disen, tag='disen_only/openl_disen_only')
                self._image_summaries(
                    self._dynamics, self._main_decode, data, embed, image_pred_joint_main, tag='main/openl_joint_main')

    def _build_model(self):
        acts = dict(
            elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish,
            leaky_relu=tf.nn.leaky_relu)
        cnn_act = acts[self._c.cnn_act]
        act = acts[self._c.dense_act]

        # Distractor dynamic model
        self._disen_encode = models.ConvEncoder(
            self._c.disen_cnn_depth, cnn_act, self._c.image_size)
        self._disen_dynamics = models.RSSM(
            self._c.disen_stoch_size, self._c.disen_deter_size, self._c.disen_deter_size)
        self._disen_only_decode = models.ConvDecoder(
            self._c.disen_cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
        self._disen_reward = models.DenseDecoder(
            (), 2, self._c.num_units, act=act)

        # Task dynamic model
        self._encode = models.ConvEncoder(
            self._c.cnn_depth, cnn_act, self._c.image_size)
        self._dynamics = models.RSSM(
            self._c.stoch_size, self._c.deter_size, self._c.deter_size)
        self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
        if self._c.pcont:
            self._pcont = models.DenseDecoder(
                (), 3, self._c.num_units, 'binary', act=act)
        self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
        self._actor = models.ActionDecoder(
            self._actdim, 4, self._c.num_units, self._c.action_dist,
            init_std=self._c.action_init_std, act=act)

        # Joint decode
        self._main_decode = models.ConvDecoderMask(
            self._c.cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
        self._disen_decode = models.ConvDecoderMask(
            self._c.disen_cnn_depth, cnn_act, (self._c.image_size, self._c.image_size, 3))
        self._joint_decode = models.ConvDecoderMaskEnsemble(
            self._main_decode, self._disen_decode, self._c.precision
        )

        disen_modules = [self._disen_encode,
                         self._disen_dynamics, self._disen_only_decode]
        model_modules = [self._encode, self._dynamics, self._reward]
        if self._c.pcont:
            model_modules.append(self._pcont)

        Optimizer = functools.partial(
            tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip,
            wdpattern=self._c.weight_decay_pattern)
        self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
        self._disen_opt = Optimizer('disen', disen_modules, self._c.model_lr)
        self._decode_opt = Optimizer(
            'decode', [self._joint_decode], self._c.model_lr)
        self._disen_reward_opt = Optimizer(
            'disen_reward', [self._disen_reward], self._c.disen_reward_lr)
        self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
        self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
        self.train(next(self._dataset))

    def _scalar_summaries_disen(
            self, prior_dist_disen, post_dist_disen, likes_disen, div_disen,
            model_loss_disen, reward_loss_disen,
            model_disen_norm, reward_disen_norm):
        self._metrics_disen['model_grad_norm'].update_state(model_disen_norm)
        self._metrics_disen['reward_grad_norm'].update_state(reward_disen_norm)
        self._metrics_disen['prior_ent'].update_state(
            prior_dist_disen.entropy())
        self._metrics_disen['post_ent'].update_state(post_dist_disen.entropy())
        for name, logprob in likes_disen.items():
            self._metrics_disen[name + '_loss'].update_state(-logprob)
        self._metrics_disen['div'].update_state(div_disen)
        self._metrics_disen['model_loss'].update_state(model_loss_disen)
        self._metrics_disen['reward_loss'].update_state(
            reward_loss_disen)

    def _image_summaries(self, dynamics, decoder, data, embed, image_pred, tag='agent/openl'):
        truth = data['image'][:6] + 0.5
        recon = image_pred.mode()[:6]
        init, _ = dynamics.observe(embed[:6, :5], data['action'][:6, :5])
        init = {k: v[:, -1] for k, v in init.items()}
        prior = dynamics.imagine(data['action'][:6, 5:], init)
        if isinstance(decoder, models.ConvDecoderMask):
            openl, _ = decoder(dynamics.get_feat(prior))
            openl = openl.mode()
        else:
            openl = decoder(dynamics.get_feat(prior)).mode()
        model = tf.concat([recon[:, :5] + 0.5, openl + 0.5], 1)
        error = (model - truth + 1) / 2
        openl = tf.concat([truth, model, error], 2)
        tools.graph_summary(
            self._writer, tools.video_summary, self._step, tag, openl)

    def _image_summaries_joint(self, data, embed, embed_disen, image_pred_joint, mask_pred):
        truth = data['image'][:6] + 0.5
        recon_joint = image_pred_joint.mode()[:6]
        mask_pred = mask_pred[:6]

        init, _ = self._dynamics.observe(
            embed[:6, :5], data['action'][:6, :5])
        init_disen, _ = self._disen_dynamics.observe(
            embed_disen[:6, :5], data['action'][:6, :5])
        init = {k: v[:, -1] for k, v in init.items()}
        init_disen = {k: v[:, -1] for k, v in init_disen.items()}
        prior = self._dynamics.imagine(
            data['action'][:6, 5:], init)
        prior_disen = self._disen_dynamics.imagine(
            data['action'][:6, 5:], init_disen)

        feat = self._dynamics.get_feat(prior)
        feat_disen = self._disen_dynamics.get_feat(prior_disen)
        openl, _, _, openl_mask = self._joint_decode(feat, feat_disen)

        openl = openl.mode()
        model = tf.concat([recon_joint[:, :5] + 0.5, openl + 0.5], 1)
        error = (model - truth + 1) / 2
        openl = tf.concat([truth, model, error], 2)
        openl_mask = tf.concat([mask_pred[:, :5] + 0.5, openl_mask + 0.5], 1)

        tools.graph_summary(
            self._writer, tools.video_summary, self._step, 'joint/openl_joint', openl)
        tools.graph_summary(
            self._writer, tools.video_summary, self._step, 'mask/openl_mask', openl_mask)

    def image_summary_from_data(self, data):
        truth = data['image'][:6] + 0.5

        # main
        embed = self._encode(data)
        post, _ = self._dynamics.observe(
            embed[:6, :5], data['action'][:6, :5])
        feat = self._dynamics.get_feat(post)
        init = {k: v[:, -1] for k, v in post.items()}

        # disen
        embed_disen = self._disen_encode(data)
        post_disen, _ = self._disen_dynamics.observe(
            embed_disen[:6, :5], data['action'][:6, :5])
        feat_disen = self._disen_dynamics.get_feat(post_disen)
        init_disen = {k: v[:, -1] for k, v in post_disen.items()}

        # joint image pred
        recon_joint, recon_main, recon_disen, recon_mask = self._joint_decode(
            feat, feat_disen)
        recon_joint = recon_joint.mode()[:6]
        recon_main = recon_main.mode()[:6]
        recon_disen = recon_disen.mode()[:6]
        recon_mask = recon_mask[:6]

        prior = self._dynamics.imagine(
            data['action'][:6, 5:], init)
        prior_disen = self._disen_dynamics.imagine(
            data['action'][:6, 5:], init_disen)
        feat = self._dynamics.get_feat(prior)
        feat_disen = self._disen_dynamics.get_feat(prior_disen)

        openl_joint, openl_main, openl_disen, openl_mask = self._joint_decode(
            feat, feat_disen)
        openl_joint = openl_joint.mode()
        openl_main = openl_main.mode()
        openl_disen = openl_disen.mode()

        model_joint = tf.concat(
            [recon_joint[:, :5] + 0.5, openl_joint + 0.5], 1)
        error_joint = (model_joint - truth + 1) / 2
        model_main = tf.concat(
            [recon_main[:, :5] + 0.5, openl_main + 0.5], 1)
        model_disen = tf.concat(
            [recon_disen[:, :5] + 0.5, openl_disen + 0.5], 1)
        model_mask = tf.concat(
            [recon_mask[:, :5] + 0.5, openl_mask + 0.5], 1)

        output_joint = tf.concat(
            [truth, model_main, model_disen, model_joint, error_joint], 2)
        output_mask = model_mask

        tools.graph_summary(
            self._writer, tools.video_summary, self._step, 'summary/openl', output_joint)
        tools.graph_summary(
            self._writer, tools.video_summary, self._step, 'summary/openl_mask', output_mask)

    def _write_summaries(self):
        step = int(self._step.numpy())
        metrics = [(k, float(v.result())) for k, v in self._metrics.items()]
        metrics_disen = [(k, float(v.result()))
                         for k, v in self._metrics_disen.items()]
        if self._last_log is not None:
            duration = time.time() - self._last_time
            self._last_time += duration
            metrics.append(('fps', (step - self._last_log) / duration))
        self._last_log = step
        [m.reset_states() for m in self._metrics.values()]
        [m.reset_states() for m in self._metrics_disen.values()]
        with (self._c.logdir / 'metrics.jsonl').open('a') as f:
            f.write(json.dumps({'step': step, **dict(metrics)}) + '\n')
        [tf.summary.scalar('agent/' + k, m) for k, m in metrics]
        [tf.summary.scalar('disen/' + k, m) for k, m in metrics_disen]
        print('#'*30 + ' Main ' + '#'*30)
        print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics))
        print('#'*30 + ' Disen ' + '#'*30)
        print(f'[{step}]', ' / '.join(f'{k} {v:.1f}' for k, v in metrics_disen))
        self._writer.flush()

class InverseDreamer(Dreamer):

    def __init__(self, config, datadir, actspace, writer):
        super().__init__(config, datadir, actspace, writer)

    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)

            action_pred = self._decode(feat)
            reward_pred = self._reward(feat)

            likes = tools.AttrDict()
            likes.action = tf.reduce_mean(
                action_pred.log_prob(data['action'][:, :-1]))
            likes.reward = tf.reduce_mean(
                reward_pred.log_prob(data['reward']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale

            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)

            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as actor_tape:
            imag_feat = self._imagine_ahead(post)
            reward = self._reward(imag_feat).mode()
            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)
            value = self._value(imag_feat).mode()
            returns = tools.lambda_return(
                reward[:-1], value[:-1], pcont[:-1],
                bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
            discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
                [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
            actor_loss = -tf.reduce_mean(discount * returns)
            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = - \
                tf.reduce_mean(discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(
                    data, feat, prior_dist, post_dist, likes, div,
                    model_loss, value_loss, actor_loss, model_norm, value_norm,
                    actor_norm)

    def _build_model(self):
        acts = dict(
            elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish,
            leaky_relu=tf.nn.leaky_relu)
        cnn_act = acts[self._c.cnn_act]
        act = acts[self._c.dense_act]
        self._encode = models.ConvEncoder(
            self._c.cnn_depth, cnn_act, self._c.image_size)
        self._dynamics = models.RSSM(
            self._c.stoch_size, self._c.deter_size, self._c.deter_size)
        self._decode = models.InverseDecoder(
            self._actdim, 4, self._c.num_units, act=act)
        self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
        if self._c.pcont:
            self._pcont = models.DenseDecoder(
                (), 3, self._c.num_units, 'binary', act=act)
        self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
        self._actor = models.ActionDecoder(
            self._actdim, 4, self._c.num_units, self._c.action_dist,
            init_std=self._c.action_init_std, act=act)
        model_modules = [self._encode, self._dynamics,
                         self._decode, self._reward]
        if self._c.pcont:
            model_modules.append(self._pcont)
        Optimizer = functools.partial(
            tools.Adam, wd=self._c.weight_decay, clip=self._c.grad_clip,
            wdpattern=self._c.weight_decay_pattern)
        self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
        self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
        self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
        self.train(next(self._dataset))