defaults:

  gpu: 'none'
  logdir: ./
  traindir: null
  evaldir: null
  offline_traindir: ''
  offline_evaldir: ''
  seed: 0
  steps: 1e7
  eval_every: 1e4
  log_every: 1e4
  reset_every: 0
  gpu_growth: True
  precision: 32
  debug: False
  expl_gifs: False

  # Environment
  task: 'dmc_walker_walk'
  size: [64, 64]
  envs: 1
  action_repeat: 2
  time_limit: 1000
  prefill: 2500
  eval_noise: 0.0
  clip_rewards: 'identity'
  atari_grayscale: False

  # Model
  dyn_cell: 'gru'
  dyn_hidden: 200
  dyn_deter: 200
  dyn_stoch: 50
  dyn_discrete: 0
  dyn_input_layers: 1
  dyn_output_layers: 1
  dyn_shared: False
  dyn_mean_act: 'none'
  dyn_std_act: 'sigmoid2'
  dyn_min_std: 0.1
  grad_heads: ['image', 'reward']
  units: 400
  reward_layers: 2
  discount_layers: 3
  value_layers: 3
  actor_layers: 4
  act: 'elu'
  cnn_depth: 32
  encoder_kernels: [4, 4, 4, 4]
  decoder_kernels: [5, 5, 6, 6]
  decoder_thin: True
  value_head: 'normal'
  kl_scale: '1.0'
  kl_balance: '0.8'
  kl_free: '1.0'
  pred_discount: False
  discount_scale: 1.0
  reward_scale: 1.0
  weight_decay: 0.0

  # Training
  batch_size: 50
  batch_length: 50
  train_every: 5
  train_steps: 1
  pretrain: 100
  model_lr: 3e-4
  value_lr: 8e-5
  actor_lr: 8e-5
  opt_eps: 1e-5
  grad_clip: 100
  value_grad_clip: 100
  actor_grad_clip: 100
  dataset_size: 0
  oversample_ends: False
  slow_value_target: True
  slow_actor_target: True
  slow_target_update: 100
  slow_target_fraction: 1
  opt: 'adam'

  # Behavior.
  discount: 0.99
  discount_lambda: 0.95
  imag_horizon: 15
  imag_gradient: 'dynamics'
  imag_gradient_mix: '0.1'
  imag_sample: True
  actor_dist: 'trunc_normal'
  actor_entropy: '1e-4'
  actor_state_entropy: 0.0
  actor_init_std: 1.0
  actor_min_std: 0.1
  actor_disc: 5
  actor_temp: 0.1
  actor_outscale: 0.0
  expl_amount: 0.0
  eval_state_mean: False
  collect_dyn_sample: True
  behavior_stop_grad: True
  value_decay: 0.0
  future_entropy: False

  # Exploration
  expl_behavior: 'greedy'
  expl_until: 0
  expl_extr_scale: 0.0
  expl_intr_scale: 1.0
  disag_target: 'stoch'
  disag_log: True
  disag_models: 10
  disag_offset: 1
  disag_layers: 4
  disag_units: 400

atari:

  # General
  task: 'atari_demon_attack'
  steps: 3e7
  eval_every: 1e5
  log_every: 1e4
  prefill: 50000
  dataset_size: 2e6
  pretrain: 0
  precision: 16

  # Environment
  time_limit: 108000  # 30 minutes of game play.
  atari_grayscale: True
  action_repeat: 4
  eval_noise: 0.001
  train_every: 16
  train_steps: 1
  clip_rewards: 'tanh'

  # Model
  grad_heads: ['image', 'reward', 'discount']
  dyn_cell: 'gru_layer_norm'
  pred_discount: True
  cnn_depth: 48
  dyn_deter: 600
  dyn_hidden: 600
  dyn_stoch: 32
  dyn_discrete: 32
  reward_layers: 4
  discount_layers: 4
  value_layers: 4
  actor_layers: 4

  # Behavior
  actor_dist: 'onehot'
  actor_entropy: 'linear(3e-3,3e-4,2.5e6)'
  expl_amount: 0.0
  expl_until: 3e7
  discount: 0.995
  imag_gradient: 'both'
  imag_gradient_mix: 'linear(0.1,0,2.5e6)'

  # Training
  discount_scale: 5.0
  reward_scale: 1
  weight_decay: 1e-6
  model_lr: 2e-4
  kl_scale: 0.1
  kl_free: 0.0
  actor_lr: 4e-5
  value_lr: 1e-4
  oversample_ends: True

  # Disen
  disen_cnn_depth: 16
  disen_only_scale: 1.0
  disen_discount_scale: 2000.0
  disen_reward_scale: 2000.0
  num_reward_opt_iters: 20

debug:

  debug: True
  pretrain: 1
  prefill: 1
  train_steps: 1
  batch_size: 10
  batch_length: 20
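
# Usage note (comments only, not part of the config): in Dreamer-style repos,
# the training script loads this file and merges the named blocks you select
# on top of `defaults`, with later blocks overriding earlier keys (e.g.
# `atari` swaps in discrete latents and the `discount` head, `debug` shrinks
# batches for quick smoke tests). A typical invocation might look like the
# line below; the script name and flags are assumptions, so check the repo's
# README for the exact command.
#
#   python dreamer.py --configs defaults atari --task atari_demon_attack --logdir ./logdir/demon_attack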