Compare commits

..

4 Commits

Author SHA1 Message Date
ca334452a0 Adding Files 2023-05-24 19:53:22 +02:00
2762254803 Adding Files 2023-05-24 19:51:34 +02:00
23f7c14c8e Adding files 2023-05-24 19:43:02 +02:00
fdd13b956d Adding encoder 2023-05-22 13:52:02 +02:00
5 changed files with 273 additions and 31 deletions

View File

@ -109,6 +109,10 @@ class PixelEncoder(nn.Module):
out_dim = OUT_DIM[num_layers] out_dim = OUT_DIM[num_layers]
self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim * 2) self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim * 2)
self.ln = nn.LayerNorm(self.feature_dim * 2) self.ln = nn.LayerNorm(self.feature_dim * 2)
<<<<<<< HEAD
self.combine = nn.Linear(self.feature_dim + 6, self.feature_dim)
=======
>>>>>>> origin/tester_1
self.outputs = dict() self.outputs = dict()
@ -153,7 +157,11 @@ class PixelEncoder(nn.Module):
out = self.reparameterize(mu, logstd) out = self.reparameterize(mu, logstd)
self.outputs['tanh'] = out self.outputs['tanh'] = out
<<<<<<< HEAD
return out, mu, logstd
=======
return out return out
>>>>>>> origin/tester_1
def copy_conv_weights_from(self, source): def copy_conv_weights_from(self, source):
"""Tie convolutional layers""" """Tie convolutional layers"""
@ -202,3 +210,101 @@ def make_encoder(
return _AVAILABLE_ENCODERS[encoder_type]( return _AVAILABLE_ENCODERS[encoder_type](
obs_shape, feature_dim, num_layers, num_filters obs_shape, feature_dim, num_layers, num_filters
) )
def club_loss(x_samples, x_mu, x_logvar, y_samples):
    """Return a CLUB-style sampled upper-bound estimate between x and y.

    Each row's squared error between ``x_mu`` and its paired ``y_samples`` row
    (the "positive" term) is contrasted with the squared error against a
    randomly permuted row of ``y_samples`` (the "negative" term). Both are
    scaled by ``exp(x_logvar)``.

    Args:
        x_samples: Only its leading dimension is read, as the batch size.
        x_mu: Predicted means, indexed like ``y_samples``.
        x_logvar: Predicted log-variances, broadcastable against ``x_mu``.
        y_samples: Target samples; dimension 0 is shuffled for the negatives.

    Returns:
        Scalar tensor: mean over the batch of (positive - negative), summed
        over the last dimension and halved.
    """
    sample_size = x_samples.shape[0]
    # Build the permutation directly on the device of the tensor it indexes;
    # randperm already yields int64, so no extra cast is required.
    random_index = torch.randperm(sample_size, device=y_samples.device)

    variance = x_logvar.exp()
    positive = -(x_mu - y_samples) ** 2 / variance
    negative = -(x_mu - y_samples[random_index]) ** 2 / variance

    upper_bound = (positive.sum(dim=-1) - negative.sum(dim=-1)).mean()
    return upper_bound / 2.0
class TransitionModel(nn.Module):
    """Recurrent latent transition model.

    Encodes a (state, action) pair with an MLP, advances a GRU history with
    that encoding, and predicts a Gaussian over the next latent state.
    """

    def __init__(self, state_size, hidden_size, action_size, history_size):
        """Build the layers.

        Args:
            state_size: Width of the latent state vector.
            hidden_size: Width of the MLP encoding of (state, action).
            action_size: Width of the action vector.
            history_size: Width of the GRU hidden state.
        """
        super().__init__()

        self.state_size = state_size
        self.hidden_size = hidden_size
        self.action_size = action_size
        self.history_size = history_size
        self.act_fn = nn.ELU()

        self.fc_state_action = nn.Linear(state_size + action_size, hidden_size)
        self.fc_hidden = nn.Linear(hidden_size, hidden_size)
        self.history_cell = nn.GRUCell(hidden_size, history_size)
        self.fc_state_mu = nn.Linear(history_size + hidden_size, state_size)
        self.fc_state_sigma = nn.Linear(history_size + hidden_size, state_size)

        # Not referenced by any method of this class.
        self.min_sigma = 1e-4
        self.max_sigma = 1e0

    def init_states(self, batch_size, device):
        """Store zero-filled previous state, action and history on `device`."""
        self.prev_state = torch.zeros(batch_size, self.state_size, device=device)
        self.prev_action = torch.zeros(batch_size, self.action_size, device=device)
        self.prev_history = torch.zeros(batch_size, self.history_size, device=device)

    def get_dist(self, mean, std):
        """Return a `torch.distributions.Normal` built from `mean` and `std`."""
        return torch.distributions.Normal(mean, std)

    def stack_states(self, states, dim=0):
        """Stack a sequence of per-step state dicts along `dim`.

        Every tensor-valued entry of the first state is stacked across all
        states, so both ``mean/std/sample/history`` dicts and the
        ``mean/logvar/sample/history`` dicts produced by `transition_step`
        are accepted. If the states carry a ``'distribution'`` entry, those
        objects are gathered into a plain list under the same key.

        Raises:
            ValueError: If `states` is empty.
        """
        if not states:
            raise ValueError("stack_states() needs at least one state")

        first = states[0]
        stacked = {
            key: torch.stack([state[key] for state in states], dim=dim)
            for key, value in first.items()
            if torch.is_tensor(value)
        }
        # The membership test has to look inside a state dict; testing the
        # list of states itself can never match the string key.
        if 'distribution' in first:
            stacked['distribution'] = [state['distribution'] for state in states]
        return stacked

    def seq_to_batch(self, state, name):
        """Merge the first two dimensions of ``state[name]``.

        Returns:
            A dict with the single key ``'sample'`` holding the reshaped tensor.
        """
        seq = state[name]
        merged_shape = (seq.shape[0] * seq.shape[1], *seq.shape[2:])
        return dict(sample=torch.reshape(seq, merged_shape))

    def transition_step(self, state, action, hist, not_done):
        """Predict the next latent state for one step.

        Args:
            state: Current latent state.
            action: Action taken in `state`.
            hist: Previous GRU history.
            not_done: Mask multiplied into `state` and `hist` before use.

        Returns:
            Dict with keys ``'mean'``, ``'logvar'``, ``'sample'`` and
            ``'history'``. ``'logvar'`` holds the tanh-squashed output of the
            sigma head; its exponential is the scale used to draw ``'sample'``.
        """
        state = state * not_done
        hist = hist * not_done

        state_action_enc = self.act_fn(self.fc_state_action(torch.cat([state, action], dim=-1)))
        # The same fc_hidden layer (shared weights) is applied three times.
        for _ in range(3):
            state_action_enc = self.act_fn(self.fc_hidden(state_action_enc))

        current_hist = self.history_cell(state_action_enc, hist)

        # Both heads read the masked *input* history, not `current_hist`.
        head_input = torch.cat([state_action_enc, hist], dim=-1)
        next_state_mu = self.act_fn(self.fc_state_mu(head_input))
        next_state_sigma = torch.tanh(self.fc_state_sigma(head_input))
        next_state = next_state_mu + torch.randn_like(next_state_mu) * next_state_sigma.exp()

        return {
            "mean": next_state_mu,
            "logvar": next_state_sigma,
            "sample": next_state,
            "history": current_hist,
        }

    def observe_rollout(self, rollout_states, rollout_actions, init_history, nonterms):
        """Roll the model over a sequence indexed along dimension 0.

        At each step the carried history is multiplied by ``nonterms[i]``
        and the resulting per-step dicts are combined with `stack_states`.
        """
        # NOTE(review): `observe_step` is not defined anywhere in this class,
        # so this method raises AttributeError for any non-empty rollout.
        # Confirm whether `transition_step` (which also needs a `not_done`
        # argument) was intended before relying on this method.
        observed_rollout = []
        for i in range(rollout_states.shape[0]):
            masked_history = nonterms[i] * init_history
            state_enc = self.observe_step(rollout_states[i], rollout_actions[i], masked_history)
            init_history = state_enc["history"]
            observed_rollout.append(state_enc)
        return self.stack_states(observed_rollout, dim=0)

    def forward(self, state, action, hist, not_done):
        """Alias for `transition_step`."""
        return self.transition_step(state, action, hist, not_done)

    def reparameterize(self, mean, std):
        """Return ``mean + eps * std`` with ``eps`` drawn from N(0, I)."""
        eps = torch.randn_like(mean)
        return mean + eps * std
# NOTE(review): a function with this name and the same body already appears
# earlier in this file; if both sit at module level the later one wins at
# import time. Consider keeping only one definition.
def club_loss(x_samples, x_mu, x_logvar, y_samples):
    """Return a CLUB-style sampled upper-bound estimate between x and y.

    Each row's squared error between ``x_mu`` and its paired ``y_samples`` row
    is contrasted with the squared error against a randomly permuted row of
    ``y_samples``. Both terms are scaled by ``exp(x_logvar)``.

    Args:
        x_samples: Only its leading dimension is read, as the batch size.
        x_mu: Predicted means, indexed like ``y_samples``.
        x_logvar: Predicted log-variances, broadcastable against ``x_mu``.
        y_samples: Target samples; dimension 0 is shuffled for the negatives.

    Returns:
        Scalar tensor: mean over the batch of (positive - negative), summed
        over the last dimension and halved.
    """
    sample_size = x_samples.shape[0]
    # Create the index on the device of the tensor being indexed; randperm
    # already returns int64, so the former `.long()` cast was a no-op.
    random_index = torch.randperm(sample_size, device=y_samples.device)

    variance = x_logvar.exp()
    positive = -(x_mu - y_samples) ** 2 / variance
    negative = -(x_mu - y_samples[random_index]) ** 2 / variance

    upper_bound = (positive.sum(dim=-1) - negative.sum(dim=-1)).mean()
    return upper_bound / 2.0

49
graphs_plot.py Normal file
View File

@ -0,0 +1,49 @@
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
    """Load scalar summaries from one TensorBoard run under `dpath`.

    Only a single directory entry is loaded: the first one in sorted order,
    so the choice is deterministic across platforms.

    Args:
        dpath: Directory whose entries are TensorBoard event files or run
            directories.

    Returns:
        A pair ``(out, steps)``. ``out`` maps each scalar tag to a list with
        one item per recorded step; each item is a list holding one value per
        loaded run. ``steps`` holds the step numbers of the *last* tag
        processed (an empty list when there are no scalar tags).

    Raises:
        FileNotFoundError: If `dpath` contains no entries.
        ValueError: If the loaded runs disagree on tags or on step numbers.
    """
    entries = sorted(os.listdir(dpath))
    if not entries:
        raise FileNotFoundError(
            "no TensorBoard event files found in {!r}".format(dpath)
        )

    summary_iterators = [EventAccumulator(os.path.join(dpath, entries[0])).Reload()]

    tags = summary_iterators[0].Tags()['scalars']
    for it in summary_iterators:
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if it.Tags()['scalars'] != tags:
            raise ValueError("runs under {!r} expose different scalar tags".format(dpath))

    out = {t: [] for t in tags}
    steps = []

    for tag in tags:
        steps = [e.step for e in summary_iterators[0].Scalars(tag)]

        for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
            if len({e.step for e in events}) != 1:
                raise ValueError("runs disagree on step numbers for tag {!r}".format(tag))
            out[tag].append([e.value for e in events])

    return out, steps
# Flatten the scalar summaries into a long-form table and plot every tag
# against the training step.
events, steps = tabulate_events('/home/vedant/pytorch_sac_ae/log/runs')

data = []
for tag, values in events.items():
    # `values` is ordered step-first: values[step_idx][run_idx]. Iterating it
    # run-first would pin every row to steps[0] whenever a single run is loaded.
    # NOTE(review): `steps` comes from the last tag only; this assumes every
    # tag was logged at the same steps - confirm for your logs.
    for step_idx, step_values in enumerate(values):
        for run_idx, value in enumerate(step_values):
            data.append({
                'tag': tag,
                'run': run_idx,
                'step': steps[step_idx],
                'value': value,
            })

df = pd.DataFrame(data)
print(df.head())

plt.figure(figsize=(10, 6))
# NOTE(review): newer seaborn releases replace `ci` with `errorbar`; keep
# `ci='sd'` unless the installed version rejects it.
sns.lineplot(data=df, x='step', y='value', hue='tag', ci='sd')
plt.show()

View File

@ -6,7 +6,7 @@ import copy
import math import math
import utils import utils
from encoder import make_encoder from encoder import make_encoder, club_loss, TransitionModel
from decoder import make_decoder from decoder import make_decoder
LOG_FREQ = 10000 LOG_FREQ = 10000
@ -73,7 +73,7 @@ class Actor(nn.Module):
def forward( def forward(
self, obs, compute_pi=True, compute_log_pi=True, detach_encoder=False self, obs, compute_pi=True, compute_log_pi=True, detach_encoder=False
): ):
obs = self.encoder(obs, detach=detach_encoder) obs,_,_ = self.encoder(obs, detach=detach_encoder)
mu, log_std = self.trunk(obs).chunk(2, dim=-1) mu, log_std = self.trunk(obs).chunk(2, dim=-1)
@ -159,7 +159,7 @@ class Critic(nn.Module):
def forward(self, obs, action, detach_encoder=False): def forward(self, obs, action, detach_encoder=False):
# detach_encoder allows to stop gradient propagation to encoder # detach_encoder allows to stop gradient propagation to encoder
obs = self.encoder(obs, detach=detach_encoder) obs,_,_ = self.encoder(obs, detach=detach_encoder)
q1 = self.Q1(obs, action) q1 = self.Q1(obs, action)
q2 = self.Q2(obs, action) q2 = self.Q2(obs, action)
@ -182,6 +182,26 @@ class Critic(nn.Module):
L.log_param('train_critic/q1_fc%d' % i, self.Q1.trunk[i * 2], step) L.log_param('train_critic/q1_fc%d' % i, self.Q1.trunk[i * 2], step)
L.log_param('train_critic/q2_fc%d' % i, self.Q2.trunk[i * 2], step) L.log_param('train_critic/q2_fc%d' % i, self.Q2.trunk[i * 2], step)
class LBLoss(nn.Module):
    """Bilinear similarity head producing contrastive logits."""

    def __init__(self, z_dim):
        """Create the learnable ``(z_dim, z_dim)`` bilinear weight ``W``.

        ``W`` is initialised with `torch.rand`.
        """
        super().__init__()
        self.z_dim = z_dim
        self.W = nn.Parameter(torch.rand(z_dim, z_dim))

    def compute_logits(self, z_a, z_pos):
        """
        Uses logits trick for CURL:
        - compute (B,B) matrix z_a (W z_pos.T)
        - positives are all diagonal elements
        - negatives are all other elements
        - to compute loss use multiclass cross entropy with identity matrix for labels

        Returns the (B,B) logits with each row's maximum subtracted, so the
        largest entry of every row is 0.
        """
        Wz = torch.matmul(self.W, z_pos.T)  # (z_dim,B)
        logits = torch.matmul(z_a, Wz)  # (B,B)
        # Shift every row by its own maximum.
        row_max = torch.max(logits, 1)[0]
        return logits - row_max[:, None]
class SacAeAgent(object): class SacAeAgent(object):
"""SAC+AE algorithm.""" """SAC+AE algorithm."""
@ -241,6 +261,12 @@ class SacAeAgent(object):
encoder_feature_dim, num_layers, num_filters encoder_feature_dim, num_layers, num_filters
).to(device) ).to(device)
self.transition_model = TransitionModel(
encoder_feature_dim, hidden_dim, action_shape[0], history_size=256
).to(device)
self.lb_loss = LBLoss(encoder_feature_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_target.load_state_dict(self.critic.state_dict())
# tie encoders between actor and critic # tie encoders between actor and critic
@ -262,7 +288,10 @@ class SacAeAgent(object):
# optimizer for critic encoder for reconstruction loss # optimizer for critic encoder for reconstruction loss
self.encoder_optimizer = torch.optim.Adam( self.encoder_optimizer = torch.optim.Adam(
self.critic.encoder.parameters(), lr=encoder_lr list(self.critic.encoder.parameters()) +
list(self.transition_model.parameters()), #+
#list(self.lb_loss.parameters()),
lr=encoder_lr
) )
# optimizer for decoder # optimizer for decoder
@ -366,32 +395,70 @@ class SacAeAgent(object):
alpha_loss.backward() alpha_loss.backward()
self.log_alpha_optimizer.step() self.log_alpha_optimizer.step()
def update_decoder(self, obs, target_obs, L, step): def update_decoder(self, obs, target_obs, L, step, obs_list, action_list, reward_list, next_obs_list, not_done_list):
h = self.critic.encoder(obs) with torch.no_grad():
hist = torch.zeros((target_obs.shape[0], 256)).to(self.device)
for i in range(len(obs_list)-1):
state, _, _ = self.critic.encoder(obs_list[i])
action = action_list[i]
not_done = not_done_list[i]
state_enc = self.transition_model(state, action, hist, not_done)
hist = state_enc["history"]
h, h_mu, h_logvar = self.critic.encoder(obs_list[-1])
h_clone = h.clone()
action = action_list[-1]
not_done = not_done_list[-1]
state_enc = self.transition_model(h, action, hist, not_done)
mean, std = state_enc["mean"], state_enc["logvar"].exp()
h_dist_enc = torch.distributions.Normal(h_mu, h_logvar.exp())
h_dist_pred = torch.distributions.Normal(mean, std)
enc_loss = torch.distributions.kl.kl_divergence(h_dist_enc, h_dist_pred).mean() * 1e-2
"""
with torch.no_grad():
z_pos, _ , _ = self.critic_target.encoder(next_obs_list[-1])
z_out = self.critic_target.encoder.combine(torch.concat((z_pos, action), dim=-1))
logits = self.lb_loss.compute_logits(h, z_out)
labels = torch.arange(logits.shape[0]).long().to(self.device)
lb_loss = nn.CrossEntropyLoss()(logits, labels) * 1e-2
"""
#with torch.no_grad():
# z_pos, _ , _ = self.critic.encoder(next_obs_list[-1])
#ub_loss = club_loss(state_enc["sample"], mean, state_enc["logvar"], h) * 1e-1
if target_obs.dim() == 4: if target_obs.dim() == 4:
# preprocess images to be in [-0.5, 0.5] range # preprocess images to be in [-0.5, 0.5] range
target_obs = utils.preprocess_obs(target_obs) target_obs = utils.preprocess_obs(target_obs)
rec_obs = self.decoder(h) rec_obs = self.decoder(h_clone)
rec_loss = F.mse_loss(target_obs, rec_obs) rec_loss = F.mse_loss(target_obs, rec_obs)
# add L2 penalty on latent representation ub_loss = torch.tensor(0.0)
# see https://arxiv.org/pdf/1903.12436.pdf #enc_loss = torch.tensor(0.0)
latent_loss = (0.5 * h.pow(2).sum(1)).mean() lb_loss = torch.tensor(0.0)
#rec_loss = torch.tensor(0.0)
loss = rec_loss + self.decoder_latent_lambda * latent_loss loss = rec_loss + enc_loss + lb_loss + ub_loss
self.encoder_optimizer.zero_grad() self.encoder_optimizer.zero_grad()
self.decoder_optimizer.zero_grad() self.decoder_optimizer.zero_grad()
loss.backward() loss.backward()
self.encoder_optimizer.step() self.encoder_optimizer.step()
self.decoder_optimizer.step() self.decoder_optimizer.step()
#enc_loss = torch.tensor(0.0)
L.log('train_ae/ae_loss', loss, step) L.log('train_ae/ae_loss', loss, step)
L.log('train_ae/rec_loss', rec_loss, step)
L.log('train_ae/enc_loss', enc_loss, step)
L.log('train_ae/lb_loss', lb_loss, step)
L.log('train_ae/ub_loss', ub_loss, step)
self.decoder.log(L, step, log_freq=LOG_FREQ) self.decoder.log(L, step, log_freq=LOG_FREQ)
def update(self, replay_buffer, L, step): def update(self, replay_buffer, L, step):
obs, action, reward, next_obs, not_done = replay_buffer.sample() obs_list, action_list, reward_list, next_obs_list, not_done_list = replay_buffer.sample()
obs, action, reward, next_obs, not_done = obs_list[-1], action_list[-1], reward_list[-1], next_obs_list[-1], not_done_list[-1]
L.log('train/batch_reward', reward.mean(), step) L.log('train/batch_reward', reward.mean(), step)
@ -413,7 +480,7 @@ class SacAeAgent(object):
) )
if self.decoder is not None and step % self.decoder_update_freq == 0: if self.decoder is not None and step % self.decoder_update_freq == 0:
self.update_decoder(obs, obs, L, step) self.update_decoder(obs, obs, L, step, obs_list, action_list, reward_list, next_obs_list, not_done_list)
def save(self, model_dir, step): def save(self, model_dir, step):
torch.save( torch.save(

View File

@ -28,36 +28,40 @@ def parse_args():
parser.add_argument('--frame_stack', default=3, type=int) parser.add_argument('--frame_stack', default=3, type=int)
parser.add_argument('--img_source', default=None, type=str, choices=['color', 'noise', 'images', 'video', 'none']) parser.add_argument('--img_source', default=None, type=str, choices=['color', 'noise', 'images', 'video', 'none'])
parser.add_argument('--resource_files', type=str) parser.add_argument('--resource_files', type=str)
<<<<<<< HEAD
parser.add_argument('--resource_files_test', type=str)
=======
>>>>>>> origin/tester_1
parser.add_argument('--total_frames', default=10000, type=int) parser.add_argument('--total_frames', default=10000, type=int)
# replay buffer # replay buffer
parser.add_argument('--replay_buffer_capacity', default=1000000, type=int) parser.add_argument('--replay_buffer_capacity', default=100000, type=int)
# train # train
parser.add_argument('--agent', default='sac_ae', type=str) parser.add_argument('--agent', default='sac_ae', type=str)
parser.add_argument('--init_steps', default=1000, type=int) parser.add_argument('--init_steps', default=1000, type=int)
parser.add_argument('--num_train_steps', default=1000000, type=int) parser.add_argument('--num_train_steps', default=2000000, type=int)
parser.add_argument('--batch_size', default=128, type=int) parser.add_argument('--batch_size', default=32, type=int)
parser.add_argument('--hidden_dim', default=1024, type=int) parser.add_argument('--hidden_dim', default=1024, type=int)
# eval # eval
parser.add_argument('--eval_freq', default=10000, type=int) parser.add_argument('--eval_freq', default=10000, type=int)
parser.add_argument('--num_eval_episodes', default=10, type=int) parser.add_argument('--num_eval_episodes', default=10, type=int)
# critic # critic
parser.add_argument('--critic_lr', default=1e-3, type=float) parser.add_argument('--critic_lr', default=1e-4, type=float)
parser.add_argument('--critic_beta', default=0.9, type=float) parser.add_argument('--critic_beta', default=0.9, type=float)
parser.add_argument('--critic_tau', default=0.01, type=float) parser.add_argument('--critic_tau', default=0.01, type=float)
parser.add_argument('--critic_target_update_freq', default=2, type=int) parser.add_argument('--critic_target_update_freq', default=2, type=int)
# actor # actor
parser.add_argument('--actor_lr', default=1e-3, type=float) parser.add_argument('--actor_lr', default=1e-4, type=float)
parser.add_argument('--actor_beta', default=0.9, type=float) parser.add_argument('--actor_beta', default=0.9, type=float)
parser.add_argument('--actor_log_std_min', default=-10, type=float) parser.add_argument('--actor_log_std_min', default=-10, type=float)
parser.add_argument('--actor_log_std_max', default=2, type=float) parser.add_argument('--actor_log_std_max', default=2, type=float)
parser.add_argument('--actor_update_freq', default=2, type=int) parser.add_argument('--actor_update_freq', default=2, type=int)
# encoder/decoder # encoder/decoder
parser.add_argument('--encoder_type', default='pixel', type=str) parser.add_argument('--encoder_type', default='pixel', type=str)
parser.add_argument('--encoder_feature_dim', default=50, type=int) parser.add_argument('--encoder_feature_dim', default=250, type=int)
parser.add_argument('--encoder_lr', default=1e-3, type=float) parser.add_argument('--encoder_lr', default=1e-4, type=float)
parser.add_argument('--encoder_tau', default=0.05, type=float) parser.add_argument('--encoder_tau', default=0.05, type=float)
parser.add_argument('--decoder_type', default='pixel', type=str) parser.add_argument('--decoder_type', default='pixel', type=str)
parser.add_argument('--decoder_lr', default=1e-3, type=float) parser.add_argument('--decoder_lr', default=1e-4, type=float)
parser.add_argument('--decoder_update_freq', default=1, type=int) parser.add_argument('--decoder_update_freq', default=1, type=int)
parser.add_argument('--decoder_latent_lambda', default=1e-6, type=float) parser.add_argument('--decoder_latent_lambda', default=1e-6, type=float)
parser.add_argument('--decoder_weight_lambda', default=1e-7, type=float) parser.add_argument('--decoder_weight_lambda', default=1e-7, type=float)
@ -153,9 +157,25 @@ def main():
) )
env.seed(args.seed) env.seed(args.seed)
env_test = dmc2gym.make(
domain_name=args.domain_name,
task_name=args.task_name,
seed=args.seed,
visualize_reward=False,
from_pixels=(args.encoder_type == 'pixel'),
height=args.image_size,
width=args.image_size,
frame_skip=args.action_repeat,
img_source=args.img_source,
resource_files=args.resource_files_test,
total_frames=args.total_frames
)
env_test.seed(args.seed)
# stack several consecutive frames together # stack several consecutive frames together
if args.encoder_type == 'pixel': if args.encoder_type == 'pixel':
env = utils.FrameStack(env, k=args.frame_stack) env = utils.FrameStack(env, k=args.frame_stack)
env_test = utils.FrameStack(env_test, k=args.frame_stack)
utils.make_dir(args.work_dir) utils.make_dir(args.work_dir)
video_dir = utils.make_dir(os.path.join(args.work_dir, 'video')) video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
@ -202,7 +222,7 @@ def main():
# evaluate agent periodically # evaluate agent periodically
if step % args.eval_freq == 0: if step % args.eval_freq == 0:
L.log('eval/episode', episode, step) L.log('eval/episode', episode, step)
evaluate(env, agent, video, args.num_eval_episodes, L, step) evaluate(env_test, agent, video, args.num_eval_episodes, L, step)
if args.save_model: if args.save_model:
agent.save(model_dir, step) agent.save(model_dir, step)
if args.save_buffer: if args.save_buffer:

View File

@ -96,17 +96,17 @@ class ReplayBuffer(object):
self.full = self.full or self.idx == 0 self.full = self.full or self.idx == 0
def sample(self): def sample(self):
begin = 2
idxs = np.random.randint( idxs = np.random.randint(
0, self.capacity if self.full else self.idx, size=self.batch_size begin, self.capacity if self.full else self.idx, size=self.batch_size
) )
past_idxs = idxs - begin
obses = torch.as_tensor(self.obses[idxs], device=self.device).float() obses = torch.as_tensor(np.swapaxes(np.asarray([self.obses[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device).float()
actions = torch.as_tensor(self.actions[idxs], device=self.device) actions = torch.as_tensor(np.swapaxes(np.asarray([self.actions[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device)
rewards = torch.as_tensor(self.rewards[idxs], device=self.device) rewards = torch.as_tensor(np.swapaxes(np.asarray([self.rewards[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device)
next_obses = torch.as_tensor( next_obses = torch.as_tensor(np.swapaxes(np.asarray([self.next_obses[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device).float()
self.next_obses[idxs], device=self.device not_dones = torch.as_tensor(np.swapaxes(np.asarray([self.not_dones[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device)
).float()
not_dones = torch.as_tensor(self.not_dones[idxs], device=self.device)
return obses, actions, rewards, next_obses, not_dones return obses, actions, rewards, next_obses, not_dones