tia/DreamerV2/exploration.py
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as prec
from tensorflow_probability import distributions as tfd

import models
import networks
import tools
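
# Exploration policies for the TIA/DreamerV2 agent. `Random` acts as a
# uniform-random baseline, while `Plan2Explore` drives exploration with an
# intrinsic reward based on the disagreement of an ensemble of learned
# prediction heads.
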
class Random(tools.Module):

  def __init__(self, config):
    self._config = config
    self._float = prec.global_policy().compute_dtype

  def actor(self, feat):
    shape = feat.shape[:-1] + [self._config.num_actions]
    if self._config.actor_dist == 'onehot':
      return tools.OneHotDist(tf.zeros(shape))
    else:
      ones = tf.ones(shape, self._float)
      return tfd.Uniform(-ones, ones)

  def train(self, start, feat, embed, kl):
    return None, {}
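
# Plan2Explore (Sekar et al., 2020): an ensemble of one-step prediction
# heads is trained on the world model's latent features, and the standard
# deviation of their predictions is used as an intrinsic exploration reward.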
class Plan2Explore(tools.Module):

  def __init__(self, config, world_model, reward=None):
    self._config = config
    self._reward = reward
    self._behavior = models.ImagBehavior(config, world_model)
    self.actor = self._behavior.actor
    size = {
        'embed': 32 * config.cnn_depth,
        'stoch': config.dyn_stoch,
        'deter': config.dyn_deter,
        'feat': config.dyn_stoch + config.dyn_deter,
    }[self._config.disag_target]
    kw = dict(
        shape=size, layers=config.disag_layers, units=config.disag_units,
        act=config.act)
    self._networks = [
        networks.DenseHead(**kw) for _ in range(config.disag_models)]
    self._opt = tools.Optimizer(
        'ensemble', config.model_lr, config.opt_eps, config.grad_clip,
        config.weight_decay, opt=config.opt)
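
  # Trains the disagreement ensemble on the configured target (embedding or
  # latent state) and then optimizes the imagined behavior against the
  # intrinsic reward defined below.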
  def train(self, start, feat, embed, kl):
    metrics = {}
    target = {
        'embed': embed,
        'stoch': start['stoch'],
        'deter': start['deter'],
        'feat': feat,
    }[self._config.disag_target]
    metrics.update(self._train_ensemble(feat, target))
    metrics.update(self._behavior.train(start, self._intrinsic_reward)[-1])
    return None, metrics
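
  # Intrinsic reward: per-dimension standard deviation across the ensemble's
  # predicted means, averaged over the feature dimension, optionally
  # log-transformed and mixed with a scaled extrinsic reward.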
  def _intrinsic_reward(self, feat, state, action):
    preds = [head(feat, tf.float32).mean() for head in self._networks]
    disag = tf.reduce_mean(tf.math.reduce_std(preds, 0), -1)
    if self._config.disag_log:
      disag = tf.math.log(disag)
    reward = self._config.expl_intr_scale * disag
    if self._config.expl_extr_scale:
      reward += tf.cast(self._config.expl_extr_scale * self._reward(
          feat, state, action), tf.float32)
    return reward
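
  # Fits each ensemble head to the (optionally time-shifted) target by
  # maximizing its log-likelihood; disagreement between heads arises mainly
  # from their different random initializations.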
  def _train_ensemble(self, inputs, targets):
    if self._config.disag_offset:
      targets = targets[:, self._config.disag_offset:]
      inputs = inputs[:, :-self._config.disag_offset]
    targets = tf.stop_gradient(targets)
    inputs = tf.stop_gradient(inputs)
    with tf.GradientTape() as tape:
      preds = [head(inputs) for head in self._networks]
      likes = [tf.reduce_mean(pred.log_prob(targets)) for pred in preds]
      loss = -tf.cast(tf.reduce_sum(likes), tf.float32)
    metrics = self._opt(tape, loss, self._networks)
    return metrics
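
# A minimal usage sketch (hypothetical wiring; `make_config()` and
# `world_model` stand in for however the surrounding agent builds its
# config and world model):
#
#   config = make_config()
#   expl = Plan2Explore(config, world_model)
#   _, metrics = expl.train(start, feat, embed, kl)  # update ensemble + actor
#   action_dist = expl.actor(feat)                   # policy over latent features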