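"""Train a pixel-based SAC+AE agent on DeepMind Control Suite tasks via dmc2gym.

Unlike the standard single-step setup, each replay-buffer entry in this script
packs two consecutive transitions (see the training loop in main()).
"""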
import argparse
import copy
import json
import math
import os
import random
import sys
import time

import dmc2gym
import gym
import numpy as np
import torch

import utils
from logger import Logger
from video import VideoRecorder

from sac_ae import SacAeAgent


def parse_args():
    parser = argparse.ArgumentParser()
    # environment
    parser.add_argument('--domain_name', default='cheetah')
    parser.add_argument('--task_name', default='run')
    parser.add_argument('--image_size', default=84, type=int)
    parser.add_argument('--action_repeat', default=1, type=int)
    parser.add_argument('--frame_stack', default=3, type=int)
    parser.add_argument('--img_source', default=None, type=str, choices=['color', 'noise', 'images', 'video', 'none'])
    parser.add_argument('--resource_files', type=str)
    parser.add_argument('--total_frames', default=10000, type=int)
    # replay buffer
    parser.add_argument('--replay_buffer_capacity', default=1000000, type=int)
    # train
    parser.add_argument('--agent', default='sac_ae', type=str)
    parser.add_argument('--init_steps', default=1000, type=int)
    parser.add_argument('--num_train_steps', default=1000000, type=int)
    parser.add_argument('--batch_size', default=512, type=int)
    parser.add_argument('--hidden_dim', default=1024, type=int)
    # eval
    parser.add_argument('--eval_freq', default=10000, type=int)
    parser.add_argument('--num_eval_episodes', default=10, type=int)
    # critic
    parser.add_argument('--critic_lr', default=1e-3, type=float)
    parser.add_argument('--critic_beta', default=0.9, type=float)
    parser.add_argument('--critic_tau', default=0.01, type=float)
    parser.add_argument('--critic_target_update_freq', default=2, type=int)
    # actor
    parser.add_argument('--actor_lr', default=1e-3, type=float)
    parser.add_argument('--actor_beta', default=0.9, type=float)
    parser.add_argument('--actor_log_std_min', default=-10, type=float)
    parser.add_argument('--actor_log_std_max', default=2, type=float)
    parser.add_argument('--actor_update_freq', default=2, type=int)
    # encoder/decoder
    parser.add_argument('--encoder_type', default='pixel', type=str)
    parser.add_argument('--encoder_feature_dim', default=50, type=int)
    parser.add_argument('--encoder_lr', default=1e-3, type=float)
    parser.add_argument('--encoder_tau', default=0.05, type=float)
    parser.add_argument('--decoder_type', default='pixel', type=str)
    parser.add_argument('--decoder_lr', default=1e-3, type=float)
    parser.add_argument('--decoder_update_freq', default=1, type=int)
    parser.add_argument('--decoder_latent_lambda', default=1e-6, type=float)
    parser.add_argument('--decoder_weight_lambda', default=1e-7, type=float)
    parser.add_argument('--num_layers', default=4, type=int)
    parser.add_argument('--num_filters', default=32, type=int)
    # sac
    parser.add_argument('--discount', default=0.99, type=float)
    parser.add_argument('--init_temperature', default=0.1, type=float)
    parser.add_argument('--alpha_lr', default=1e-4, type=float)
    parser.add_argument('--alpha_beta', default=0.5, type=float)
    # misc
    parser.add_argument('--seed', default=1, type=int)
    parser.add_argument('--work_dir', default='.', type=str)
    parser.add_argument('--save_tb', default=False, action='store_true')
    parser.add_argument('--save_model', default=False, action='store_true')
    parser.add_argument('--save_buffer', default=False, action='store_true')
    parser.add_argument('--save_video', default=False, action='store_true')

    args = parser.parse_args()
    return args


def evaluate(env, agent, video, num_episodes, L, step):
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            with utils.eval_mode(agent):
                action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/episode_reward', episode_reward, step)
    L.dump(step)


def make_agent(obs_shape, action_shape, args, device):
    if args.agent == 'sac_ae':
        return SacAeAgent(
            obs_shape=obs_shape,
            action_shape=action_shape,
            device=device,
            hidden_dim=args.hidden_dim,
            discount=args.discount,
            init_temperature=args.init_temperature,
            alpha_lr=args.alpha_lr,
            alpha_beta=args.alpha_beta,
            actor_lr=args.actor_lr,
            actor_beta=args.actor_beta,
            actor_log_std_min=args.actor_log_std_min,
            actor_log_std_max=args.actor_log_std_max,
            actor_update_freq=args.actor_update_freq,
            critic_lr=args.critic_lr,
            critic_beta=args.critic_beta,
            critic_tau=args.critic_tau,
            critic_target_update_freq=args.critic_target_update_freq,
            encoder_type=args.encoder_type,
            encoder_feature_dim=args.encoder_feature_dim,
            encoder_lr=args.encoder_lr,
            encoder_tau=args.encoder_tau,
            decoder_type=args.decoder_type,
            decoder_lr=args.decoder_lr,
            decoder_update_freq=args.decoder_update_freq,
            decoder_latent_lambda=args.decoder_latent_lambda,
            decoder_weight_lambda=args.decoder_weight_lambda,
            num_layers=args.num_layers,
            num_filters=args.num_filters
        )
    else:
        # an assert on a non-empty string never fails, so fail explicitly instead
        assert False, 'agent is not supported: %s' % args.agent


def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.encoder_type == 'pixel'),
        height=args.image_size,
        width=args.image_size,
        frame_skip=args.action_repeat,
        img_source=args.img_source,
        resource_files=args.resource_files,
        total_frames=args.total_frames
    )
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # the dmc2gym wrapper standardizes actions
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(
        obs_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device
    )

    agent = make_agent(
        obs_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        args=args,
        device=device
    )

    L = Logger(args.work_dir, use_tb=args.save_tb)

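    # done=True forces an env.reset() on the first pass through the loop below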
    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            L.log('train/episode', episode, step)

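        # At the start of each episode, take the first two environment steps up
        # front so that a complete two-step transition is in the buffer before
        # the regular per-step block below runs.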
        if episode_step == 0:
            last_obs = obs
            # sample action for data collection
            if step < args.init_steps:
                last_action = env.action_space.sample()
            else:
                with utils.eval_mode(agent):
                    last_action = agent.sample_action(last_obs)

            curr_obs, last_reward, last_done, _ = env.step(last_action)

            # allow infinite bootstrap
            last_done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(last_done)
            episode_reward += last_reward

            # sample action for data collection
            if step < args.init_steps:
                action = env.action_space.sample()
            else:
                with utils.eval_mode(agent):
                    action = agent.sample_action(curr_obs)

            next_obs, reward, done, _ = env.step(action)

            # allow infinite bootstrap
            done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
            episode_reward += reward

            replay_buffer.add(last_obs, last_action, last_reward, curr_obs, last_done_bool, action, reward, next_obs, done_bool)

            # slide the two-step window forward by one transition
            last_obs = curr_obs
            last_action = action
            last_reward = reward
            last_done = done
            last_done_bool = done_bool
            curr_obs = next_obs

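        # Regular per-step collection: the agent acts on the newest observation,
        # and each buffer entry packs the two most recent transitions together.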
        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(curr_obs)

        # run training update
        if step >= args.init_steps:
            #num_updates = args.init_steps if step == args.init_steps else 1
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        #replay_buffer.add(obs, action, reward, next_obs, done_bool)
        replay_buffer.add(last_obs, last_action, last_reward, curr_obs, last_done_bool, action, reward, next_obs, done_bool)

        # slide the two-step window so the next iteration acts on fresh observations
        last_obs = curr_obs
        last_action = action
        last_reward = reward
        last_done_bool = done_bool
        curr_obs = next_obs

        obs = next_obs
        episode_step += 1


if __name__ == '__main__':
    main()