def club_loss(x_samples, x_mu, x_logvar, y_samples):
    """Contrast matched against shuffled pairs under a diagonal Gaussian.

    Every row of ``y_samples`` is scored against ``x_mu`` with a squared
    error scaled by ``1 / exp(x_logvar)``: once paired with its own row
    (positive term) and once paired with a row drawn through a random
    permutation of the batch (negative term). The result is the batch mean
    of ``positive - negative`` (each summed over the last dimension), halved.
    Presumably this is the sample-based CLUB mutual-information upper bound
    the function is named after -- confirm against the reference you follow.

    Args:
        x_samples: Only its leading dimension is read, as the batch size.
        x_mu: Gaussian means, broadcastable against ``y_samples``.
        x_logvar: Log of the scale that divides the squared error.
        y_samples: Batch of vectors to score; shuffled along dim 0 for the
            negative term.

    Returns:
        A 0-dim tensor holding the loss value.
    """
    sample_size = x_samples.shape[0]
    # Create the permutation directly on the device of the tensor it indexes
    # instead of always on the CPU; randperm already yields int64 indices.
    random_index = torch.randperm(sample_size, device=y_samples.device)

    variance = x_logvar.exp()
    positive = -(x_mu - y_samples) ** 2 / variance
    negative = -(x_mu - y_samples[random_index]) ** 2 / variance
    upper_bound = (positive.sum(dim=-1) - negative.sum(dim=-1)).mean()
    return upper_bound / 2.0
class TransitionModel(nn.Module):
    """Recurrent one-step latent transition model.

    A (state, action) pair is encoded by an MLP, a GRU cell updates a running
    history vector, and two linear heads produce the mean and the log-scale
    of a Gaussian over the next latent state.
    """

    def __init__(self, state_size, hidden_size, action_size, history_size):
        """Build the layers.

        Args:
            state_size: Width of the latent state vector.
            hidden_size: Width of the MLP encoding of (state, action).
            action_size: Width of the action vector.
            history_size: Width of the GRU history vector.
        """
        super().__init__()

        self.state_size = state_size
        self.hidden_size = hidden_size
        self.action_size = action_size
        self.history_size = history_size
        self.act_fn = nn.ELU()

        self.fc_state_action = nn.Linear(state_size + action_size, hidden_size)
        self.fc_hidden = nn.Linear(hidden_size, hidden_size)
        self.history_cell = nn.GRUCell(hidden_size, history_size)
        self.fc_state_mu = nn.Linear(history_size + hidden_size, state_size)
        self.fc_state_sigma = nn.Linear(history_size + hidden_size, state_size)

        # Not read by any method of this class; kept because they are
        # public attributes.
        self.min_sigma = 1e-4
        self.max_sigma = 1e0

    def init_states(self, batch_size, device):
        """Reset the cached previous state, action and history to zeros."""
        self.prev_state = torch.zeros(batch_size, self.state_size, device=device)
        self.prev_action = torch.zeros(batch_size, self.action_size, device=device)
        self.prev_history = torch.zeros(batch_size, self.history_size, device=device)

    def get_dist(self, mean, std):
        """Return ``torch.distributions.Normal(mean, std)``."""
        return torch.distributions.Normal(mean, std)

    def stack_states(self, states, dim=0):
        """Stack a sequence of per-step state dicts into one dict of tensors.

        The tensor entries ``mean``, ``std``, ``logvar``, ``sample`` and
        ``history`` are stacked along ``dim`` when the step dicts carry them,
        so both ``std``-style dicts and the ``logvar``-style dicts returned
        by :meth:`transition_step` are accepted. A ``distribution`` entry,
        when present, is collected into a plain list.

        Raises:
            ValueError: If ``states`` is empty.
        """
        if not states:
            raise ValueError("stack_states() needs at least one state")
        first = states[0]
        stacked = {
            key: torch.stack([state[key] for state in states], dim=dim)
            for key in ("mean", "std", "logvar", "sample", "history")
            if key in first
        }
        # Membership has to be tested on a step dict, not on the list itself.
        if "distribution" in first:
            stacked["distribution"] = [state["distribution"] for state in states]
        return stacked

    def seq_to_batch(self, state, name):
        """Merge the first two dimensions of ``state[name]``.

        Returns:
            ``{"sample": tensor}`` with shape ``(d0 * d1, *rest)``.
        """
        tensor = state[name]
        merged_shape = (tensor.shape[0] * tensor.shape[1], *tensor.shape[2:])
        return dict(sample=torch.reshape(tensor, merged_shape))

    def transition_step(self, state, action, hist, not_done):
        """Advance the model by one step.

        Args:
            state: Current latent state.
            action: Action taken in ``state``.
            hist: History vector from the previous step.
            not_done: Multiplicative mask applied to both ``state`` and
                ``hist`` before anything else.

        Returns:
            Dict with ``mean``, ``logvar``, ``sample`` and ``history``.
            Despite its key, ``logvar`` is used as a log *standard deviation*:
            the sample is ``mean + eps * exp(logvar)``. It is tanh-bounded.
        """
        state = state * not_done
        hist = hist * not_done

        encoded = self.act_fn(
            self.fc_state_action(torch.cat([state, action], dim=-1))
        )
        # The same fc_hidden layer (shared weights) is applied three times.
        for _ in range(3):
            encoded = self.act_fn(self.fc_hidden(encoded))

        current_hist = self.history_cell(encoded, hist)
        # Both heads read the masked *previous* history, not current_hist.
        features = torch.cat([encoded, hist], dim=-1)
        next_state_mu = self.act_fn(self.fc_state_mu(features))
        next_state_sigma = torch.tanh(self.fc_state_sigma(features))
        next_state = self.reparameterize(next_state_mu, next_state_sigma.exp())

        return {
            "mean": next_state_mu,
            "logvar": next_state_sigma,
            "sample": next_state,
            "history": current_hist,
        }

    def observe_rollout(self, rollout_states, rollout_actions, init_history, nonterms):
        """Run the model over a time-major rollout and stack the results.

        Args:
            rollout_states: States indexed by time along dim 0.
            rollout_actions: Actions indexed by time along dim 0.
            init_history: History vector fed to the first step.
            nonterms: Per-step multiplicative masks applied to the history.

        Returns:
            The per-step outputs of :meth:`transition_step`, stacked along
            dim 0 by :meth:`stack_states`.
        """
        observed_rollout = []
        history = init_history
        for step_states, step_actions, step_nonterm in zip(
            rollout_states, rollout_actions, nonterms
        ):
            masked_history = step_nonterm * history
            # The class has no ``observe_step``; the only step function is
            # ``transition_step``. An all-ones mask keeps the original
            # behaviour of masking the history only.
            # NOTE(review): confirm the state should not be masked as well.
            state_enc = self.transition_step(
                step_states,
                step_actions,
                masked_history,
                torch.ones_like(step_nonterm),
            )
            history = state_enc["history"]
            observed_rollout.append(state_enc)
        return self.stack_states(observed_rollout, dim=0)

    def forward(self, state, action, hist, not_done):
        """Alias for :meth:`transition_step`."""
        return self.transition_step(state, action, hist, not_done)

    def reparameterize(self, mean, std):
        """Draw ``mean + eps * std`` with ``eps`` standard normal."""
        eps = torch.randn_like(mean)
        return mean + eps * std
import os
import sys

# Log directory the plotting script reads when no path is given on the
# command line.
DEFAULT_LOG_DIR = '/home/vedant/pytorch_sac_ae/log/runs'


def _load_scalars(dpath):
    """Read every scalar series of the first entry found in ``dpath``.

    Returns:
        ``(out, steps_by_tag)``. ``out[tag]`` is a list with one item per
        logged step, each item being the list of per-run values (a single
        run is loaded, so each item has length one). ``steps_by_tag[tag]``
        is the list of step numbers of that tag.

    Raises:
        FileNotFoundError: If ``dpath`` contains no entries.
    """
    # Imported lazily so the pure helpers of this module stay importable
    # without TensorBoard installed.
    from tensorboard.backend.event_processing.event_accumulator import (
        EventAccumulator,
    )

    # Sorted so that the chosen run does not depend on directory order.
    entries = sorted(os.listdir(dpath))
    if not entries:
        raise FileNotFoundError(f'no event files or run directories in {dpath!r}')
    accumulators = [EventAccumulator(os.path.join(dpath, entries[0])).Reload()]

    out = {}
    steps_by_tag = {}
    for tag in accumulators[0].Tags()['scalars']:
        per_run = [acc.Scalars(tag) for acc in accumulators]
        steps_by_tag[tag] = [event.step for event in per_run[0]]
        out[tag] = [[event.value for event in events] for events in zip(*per_run)]
    return out, steps_by_tag


def tabulate_events(dpath):
    """Load the scalars logged under ``dpath``.

    Returns:
        ``(out, steps)``: ``out`` maps each scalar tag to a list of per-step
        value lists, and ``steps`` holds the step numbers of the *last* tag
        only (an empty list when there are no scalar tags). Tags may be
        logged at different steps, so prefer ``events_to_records`` when
        values have to be paired with their steps.
    """
    out, steps_by_tag = _load_scalars(dpath)
    steps = list(steps_by_tag.values())[-1] if steps_by_tag else []
    return out, steps


def events_to_records(events, steps_by_tag):
    """Flatten tabulated scalars into one record per (tag, step, run).

    Args:
        events: Mapping ``tag -> [[value_run0, value_run1, ...], ...]`` with
            one inner list per logged step.
        steps_by_tag: Mapping ``tag -> [step, ...]`` aligned with ``events``.

    Returns:
        A list of ``{'tag', 'run', 'step', 'value'}`` dicts.
    """
    records = []
    for tag, per_step_values in events.items():
        # The outer axis is the step and the inner axis is the run.
        for step, run_values in zip(steps_by_tag[tag], per_step_values):
            for run_idx, value in enumerate(run_values):
                records.append({
                    'tag': tag,
                    'run': run_idx,
                    'step': step,
                    'value': value,
                })
    return records


def main(dpath=DEFAULT_LOG_DIR):
    """Plot every scalar tag found under ``dpath`` against its step."""
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    events, steps_by_tag = _load_scalars(dpath)
    df = pd.DataFrame(events_to_records(events, steps_by_tag))
    print(df.head())

    plt.figure(figsize=(10, 6))
    # NOTE(review): seaborn >= 0.12 deprecates ``ci`` in favour of
    # ``errorbar='sd'``; left unchanged to keep older versions working.
    sns.lineplot(data=df, x='step', y='value', hue='tag', ci='sd')
    plt.show()


if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_LOG_DIR)
class LBLoss(nn.Module):
    """Bilinear similarity head producing contrastive logits.

    Holds a learnable ``(z_dim, z_dim)`` matrix ``W`` and scores every anchor
    against every candidate as ``z_a @ W @ z_pos.T``.
    """

    def __init__(self, z_dim):
        """Create the module.

        Args:
            z_dim: Width of the latent vectors being compared.
        """
        super().__init__()
        self.z_dim = z_dim

        self.W = nn.Parameter(torch.rand(z_dim, z_dim))

    def compute_logits(self, z_a, z_pos):
        """Return the ``(B, B)`` matrix of bilinear similarities.

        Uses the logits trick from CURL:
        - compute the (B, B) matrix z_a (W z_pos.T)
        - positives are all diagonal elements
        - negatives are all other elements
        - to compute the loss, use multiclass cross entropy with the
          identity matrix for labels

        Args:
            z_a: Anchor latents, shape ``(B, z_dim)``.
            z_pos: Candidate latents, shape ``(B, z_dim)``.

        Returns:
            Logits of shape ``(B, B)``; the row-wise maximum is subtracted,
            so the largest entry of every row is exactly zero.
        """
        Wz = torch.matmul(self.W, z_pos.T)  # (z_dim, B)
        logits = torch.matmul(z_a, Wz)  # (B, B)
        # Shift each row by its maximum for numerical stability.
        return logits - logits.max(dim=1, keepdim=True).values

    def forward(self, z_a, z_pos):
        """Alias for :meth:`compute_logits` so the module can be called."""
        return self.compute_logits(z_a, z_pos)
def update_decoder(self, obs, target_obs, L, step, obs_list, action_list,
                   reward_list, next_obs_list, not_done_list):
    """Update encoder, transition model and decoder on a sampled sequence.

    The loss is the reconstruction error of ``target_obs`` plus a KL term
    (scaled by 1e-2) between the encoder's Gaussian for the last observation
    and the transition model's Gaussian prediction.

    Args:
        obs: Unused here; the last entry of ``obs_list`` is encoded instead.
        target_obs: Reconstruction target; its leading dimension also sets
            the batch size of the initial history.
        L: Logger that receives the loss values.
        step: Training step used for logging.
        obs_list: Observations indexed by time along the first axis.
        action_list: Actions aligned with ``obs_list``.
        reward_list: Unused.
        next_obs_list: Unused.
        not_done_list: Multiplicative masks aligned with ``obs_list``.
    """
    # Warm up the recurrent history on all steps but the last; no gradients
    # flow through this part.
    with torch.no_grad():
        hist = torch.zeros(
            (target_obs.shape[0], self.transition_model.history_size),
            device=self.device,
        )
        for past_obs, past_action, past_not_done in zip(
            obs_list[:-1], action_list[:-1], not_done_list[:-1]
        ):
            state, _, _ = self.critic.encoder(past_obs)
            hist = self.transition_model(
                state, past_action, hist, past_not_done
            )["history"]

    h, h_mu, h_logvar = self.critic.encoder(obs_list[-1])
    state_enc = self.transition_model(
        h, action_list[-1], hist, not_done_list[-1]
    )

    # Both log-scale outputs are exponentiated and used directly as the
    # standard deviation of a Normal.
    h_dist_enc = torch.distributions.Normal(h_mu, h_logvar.exp())
    h_dist_pred = torch.distributions.Normal(
        state_enc["mean"], state_enc["logvar"].exp()
    )
    # NOTE(review): the prediction is conditioned on ``h`` itself, so this
    # pulls the one-step prediction toward the posterior of the *same*
    # observation rather than of ``next_obs_list[-1]`` -- confirm intent.
    enc_loss = (
        torch.distributions.kl.kl_divergence(h_dist_enc, h_dist_pred).mean()
        * 1e-2
    )

    if target_obs.dim() == 4:
        # preprocess images to be in [-0.5, 0.5] range
        target_obs = utils.preprocess_obs(target_obs)
    rec_obs = self.decoder(h)
    rec_loss = F.mse_loss(target_obs, rec_obs)

    # The lower- and upper-bound terms are disabled; zeros keep the logged
    # metrics in place.
    lb_loss = torch.zeros((), device=rec_loss.device)
    ub_loss = torch.zeros((), device=rec_loss.device)

    loss = rec_loss + enc_loss + lb_loss + ub_loss
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    loss.backward()

    self.encoder_optimizer.step()
    self.decoder_optimizer.step()

    L.log('train_ae/ae_loss', loss, step)
    L.log('train_ae/rec_loss', rec_loss, step)
    L.log('train_ae/enc_loss', enc_loss, step)
    L.log('train_ae/lb_loss', lb_loss, step)
    L.log('train_ae/ub_loss', ub_loss, step)

    self.decoder.log(L, step, log_freq=LOG_FREQ)
type=int) + parser.add_argument('--img_source', default=None, type=str, choices=['color', 'noise', 'images', 'video', 'none']) + parser.add_argument('--resource_files', type=str) + parser.add_argument('--resource_files_test', type=str) + parser.add_argument('--total_frames', default=10000, type=int) # replay buffer - parser.add_argument('--replay_buffer_capacity', default=1000000, type=int) + parser.add_argument('--replay_buffer_capacity', default=100000, type=int) # train parser.add_argument('--agent', default='sac_ae', type=str) parser.add_argument('--init_steps', default=1000, type=int) - parser.add_argument('--num_train_steps', default=1000000, type=int) - parser.add_argument('--batch_size', default=128, type=int) + parser.add_argument('--num_train_steps', default=2000000, type=int) + parser.add_argument('--batch_size', default=32, type=int) parser.add_argument('--hidden_dim', default=1024, type=int) # eval parser.add_argument('--eval_freq', default=10000, type=int) parser.add_argument('--num_eval_episodes', default=10, type=int) # critic - parser.add_argument('--critic_lr', default=1e-3, type=float) + parser.add_argument('--critic_lr', default=1e-4, type=float) parser.add_argument('--critic_beta', default=0.9, type=float) parser.add_argument('--critic_tau', default=0.01, type=float) parser.add_argument('--critic_target_update_freq', default=2, type=int) # actor - parser.add_argument('--actor_lr', default=1e-3, type=float) + parser.add_argument('--actor_lr', default=1e-4, type=float) parser.add_argument('--actor_beta', default=0.9, type=float) parser.add_argument('--actor_log_std_min', default=-10, type=float) parser.add_argument('--actor_log_std_max', default=2, type=float) parser.add_argument('--actor_update_freq', default=2, type=int) # encoder/decoder parser.add_argument('--encoder_type', default='pixel', type=str) - parser.add_argument('--encoder_feature_dim', default=50, type=int) - parser.add_argument('--encoder_lr', default=1e-3, type=float) + 
parser.add_argument('--encoder_feature_dim', default=250, type=int) + parser.add_argument('--encoder_lr', default=1e-4, type=float) parser.add_argument('--encoder_tau', default=0.05, type=float) parser.add_argument('--decoder_type', default='pixel', type=str) - parser.add_argument('--decoder_lr', default=1e-3, type=float) + parser.add_argument('--decoder_lr', default=1e-4, type=float) parser.add_argument('--decoder_update_freq', default=1, type=int) parser.add_argument('--decoder_latent_lambda', default=1e-6, type=float) parser.add_argument('--decoder_weight_lambda', default=1e-7, type=float) @@ -143,13 +147,32 @@ def main(): from_pixels=(args.encoder_type == 'pixel'), height=args.image_size, width=args.image_size, - frame_skip=args.action_repeat + frame_skip=args.action_repeat, + img_source=args.img_source, + resource_files=args.resource_files, + total_frames=args.total_frames ) env.seed(args.seed) + env_test = dmc2gym.make( + domain_name=args.domain_name, + task_name=args.task_name, + seed=args.seed, + visualize_reward=False, + from_pixels=(args.encoder_type == 'pixel'), + height=args.image_size, + width=args.image_size, + frame_skip=args.action_repeat, + img_source=args.img_source, + resource_files=args.resource_files_test, + total_frames=args.total_frames + ) + env_test.seed(args.seed) + # stack several consecutive frames together if args.encoder_type == 'pixel': env = utils.FrameStack(env, k=args.frame_stack) + env_test = utils.FrameStack(env_test, k=args.frame_stack) utils.make_dir(args.work_dir) video_dir = utils.make_dir(os.path.join(args.work_dir, 'video')) @@ -196,7 +219,7 @@ def main(): # evaluate agent periodically if step % args.eval_freq == 0: L.log('eval/episode', episode, step) - evaluate(env, agent, video, args.num_eval_episodes, L, step) + evaluate(env_test, agent, video, args.num_eval_episodes, L, step) if args.save_model: agent.save(model_dir, step) if args.save_buffer: diff --git a/utils.py b/utils.py index 067715c..2b7eda7 100644 --- 
def sample(self, seq_len=2):
    """Sample a batch of short windows of consecutive transitions.

    For each of ``self.batch_size`` random end indices ``i`` the window
    ``[i - seq_len, i)`` of stored transitions is gathered, so the last
    element of a window is the transition stored at ``i - 1``.

    NOTE(review): windows are taken by position only; presumably they can
    straddle an episode boundary or the buffer's current write position --
    confirm that the callers tolerate this.

    Args:
        seq_len: Number of consecutive transitions per window.

    Returns:
        ``(obses, actions, rewards, next_obses, not_dones)``, each with
        time along the first axis and batch along the second
        (``(seq_len, batch_size, ...)``). Observations are cast to float.

    Raises:
        ValueError: If ``seq_len`` is smaller than one or the buffer holds
            too few transitions to place a window.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    upper = self.capacity if self.full else self.idx
    if upper <= seq_len:
        raise ValueError(
            f"need more than {seq_len} stored transitions to sample, have {upper}"
        )

    idxs = np.random.randint(seq_len, upper, size=self.batch_size)
    # (seq_len, batch) grid of buffer positions: row t holds idxs - seq_len + t.
    # Indexing with it gathers all windows at once, already time-major.
    window = (idxs - seq_len)[None, :] + np.arange(seq_len)[:, None]

    obses = torch.as_tensor(self.obses[window], device=self.device).float()
    actions = torch.as_tensor(self.actions[window], device=self.device)
    rewards = torch.as_tensor(self.rewards[window], device=self.device)
    next_obses = torch.as_tensor(
        self.next_obses[window], device=self.device
    ).float()
    not_dones = torch.as_tensor(self.not_dones[window], device=self.device)
    return obses, actions, rewards, next_obses, not_dones