Compare commits
No commits in common. "tester_1" and "master" have entirely different histories.
@ -1,4 +1,4 @@
|
|||||||
name: pytorch_sac_ae2
|
name: pytorch_sac_ae
|
||||||
channels:
|
channels:
|
||||||
- defaults
|
- defaults
|
||||||
dependencies:
|
dependencies:
|
||||||
|
169
encoder.py
169
encoder.py
@ -1,5 +1,6 @@
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
def tie_weights(src, trg):
|
def tie_weights(src, trg):
|
||||||
@ -10,85 +11,6 @@ def tie_weights(src, trg):
|
|||||||
|
|
||||||
OUT_DIM = {2: 39, 4: 35, 6: 31}
|
OUT_DIM = {2: 39, 4: 35, 6: 31}
|
||||||
|
|
||||||
'''
|
|
||||||
class PixelEncoder(nn.Module):
|
|
||||||
"""Convolutional encoder of pixels observations."""
|
|
||||||
def __init__(self, obs_shape, feature_dim, num_layers=2, num_filters=32):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
assert len(obs_shape) == 3
|
|
||||||
|
|
||||||
self.feature_dim = feature_dim
|
|
||||||
self.num_layers = num_layers
|
|
||||||
|
|
||||||
self.convs = nn.ModuleList(
|
|
||||||
[nn.Conv2d(obs_shape[0], num_filters, 3, stride=2)]
|
|
||||||
)
|
|
||||||
for i in range(num_layers - 1):
|
|
||||||
self.convs.append(nn.Conv2d(num_filters, num_filters, 3, stride=1))
|
|
||||||
|
|
||||||
out_dim = OUT_DIM[num_layers]
|
|
||||||
self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim)
|
|
||||||
self.ln = nn.LayerNorm(self.feature_dim)
|
|
||||||
|
|
||||||
self.outputs = dict()
|
|
||||||
|
|
||||||
def reparameterize(self, mu, logstd):
|
|
||||||
std = torch.exp(logstd)
|
|
||||||
eps = torch.randn_like(std)
|
|
||||||
return mu + eps * std
|
|
||||||
|
|
||||||
def forward_conv(self, obs):
|
|
||||||
obs = obs / 255.
|
|
||||||
self.outputs['obs'] = obs
|
|
||||||
|
|
||||||
conv = torch.relu(self.convs[0](obs))
|
|
||||||
self.outputs['conv1'] = conv
|
|
||||||
|
|
||||||
for i in range(1, self.num_layers):
|
|
||||||
conv = torch.relu(self.convs[i](conv))
|
|
||||||
self.outputs['conv%s' % (i + 1)] = conv
|
|
||||||
|
|
||||||
h = conv.view(conv.size(0), -1)
|
|
||||||
return h
|
|
||||||
|
|
||||||
def forward(self, obs, detach=False):
|
|
||||||
h = self.forward_conv(obs)
|
|
||||||
|
|
||||||
if detach:
|
|
||||||
h = h.detach()
|
|
||||||
|
|
||||||
h_fc = self.fc(h)
|
|
||||||
self.outputs['fc'] = h_fc
|
|
||||||
|
|
||||||
h_norm = self.ln(h_fc)
|
|
||||||
self.outputs['ln'] = h_norm
|
|
||||||
|
|
||||||
out = torch.tanh(h_norm)
|
|
||||||
self.outputs['tanh'] = out
|
|
||||||
|
|
||||||
return out
|
|
||||||
|
|
||||||
def copy_conv_weights_from(self, source):
|
|
||||||
"""Tie convolutional layers"""
|
|
||||||
# only tie conv layers
|
|
||||||
for i in range(self.num_layers):
|
|
||||||
tie_weights(src=source.convs[i], trg=self.convs[i])
|
|
||||||
|
|
||||||
def log(self, L, step, log_freq):
|
|
||||||
if step % log_freq != 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
for k, v in self.outputs.items():
|
|
||||||
L.log_histogram('train_encoder/%s_hist' % k, v, step)
|
|
||||||
if len(v.shape) > 2:
|
|
||||||
L.log_image('train_encoder/%s_img' % k, v[0], step)
|
|
||||||
|
|
||||||
for i in range(self.num_layers):
|
|
||||||
L.log_param('train_encoder/conv%s' % (i + 1), self.convs[i], step)
|
|
||||||
L.log_param('train_encoder/fc', self.fc, step)
|
|
||||||
L.log_param('train_encoder/ln', self.ln, step)
|
|
||||||
'''
|
|
||||||
|
|
||||||
class PixelEncoder(nn.Module):
|
class PixelEncoder(nn.Module):
|
||||||
"""Convolutional encoder of pixels observations."""
|
"""Convolutional encoder of pixels observations."""
|
||||||
@ -109,7 +31,6 @@ class PixelEncoder(nn.Module):
|
|||||||
out_dim = OUT_DIM[num_layers]
|
out_dim = OUT_DIM[num_layers]
|
||||||
self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim * 2)
|
self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim * 2)
|
||||||
self.ln = nn.LayerNorm(self.feature_dim * 2)
|
self.ln = nn.LayerNorm(self.feature_dim * 2)
|
||||||
self.combine = nn.Linear(self.feature_dim + 6, self.feature_dim)
|
|
||||||
|
|
||||||
self.outputs = dict()
|
self.outputs = dict()
|
||||||
|
|
||||||
@ -144,16 +65,16 @@ class PixelEncoder(nn.Module):
|
|||||||
h_norm = self.ln(h_fc)
|
h_norm = self.ln(h_fc)
|
||||||
self.outputs['ln'] = h_norm
|
self.outputs['ln'] = h_norm
|
||||||
|
|
||||||
#out = torch.tanh(h_norm)
|
h_tan = torch.tanh(h_norm)
|
||||||
|
|
||||||
mu, logstd = torch.chunk(h_norm, 2, dim=-1)
|
mu, logstd = torch.chunk(h_tan, 2, dim=-1)
|
||||||
logstd = torch.tanh(logstd)
|
|
||||||
self.outputs['mu'] = mu
|
self.outputs['mu'] = mu
|
||||||
self.outputs['logstd'] = logstd
|
self.outputs['logstd'] = logstd
|
||||||
self.outputs['std'] = logstd.exp()
|
|
||||||
|
std = torch.tanh(h_norm)
|
||||||
|
self.outputs['std'] = std
|
||||||
|
|
||||||
out = self.reparameterize(mu, logstd)
|
out = self.reparameterize(mu, logstd)
|
||||||
self.outputs['tanh'] = out
|
|
||||||
return out, mu, logstd
|
return out, mu, logstd
|
||||||
|
|
||||||
def copy_conv_weights_from(self, source):
|
def copy_conv_weights_from(self, source):
|
||||||
@ -176,6 +97,7 @@ class PixelEncoder(nn.Module):
|
|||||||
L.log_param('train_encoder/fc', self.fc, step)
|
L.log_param('train_encoder/fc', self.fc, step)
|
||||||
L.log_param('train_encoder/ln', self.ln, step)
|
L.log_param('train_encoder/ln', self.ln, step)
|
||||||
|
|
||||||
|
|
||||||
class IdentityEncoder(nn.Module):
|
class IdentityEncoder(nn.Module):
|
||||||
def __init__(self, obs_shape, feature_dim, num_layers, num_filters):
|
def __init__(self, obs_shape, feature_dim, num_layers, num_filters):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -193,27 +115,6 @@ class IdentityEncoder(nn.Module):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
_AVAILABLE_ENCODERS = {'pixel': PixelEncoder, 'identity': IdentityEncoder}
|
|
||||||
|
|
||||||
|
|
||||||
def make_encoder(
|
|
||||||
encoder_type, obs_shape, feature_dim, num_layers, num_filters
|
|
||||||
):
|
|
||||||
assert encoder_type in _AVAILABLE_ENCODERS
|
|
||||||
return _AVAILABLE_ENCODERS[encoder_type](
|
|
||||||
obs_shape, feature_dim, num_layers, num_filters
|
|
||||||
)
|
|
||||||
|
|
||||||
def club_loss(x_samples, x_mu, x_logvar, y_samples):
|
|
||||||
sample_size = x_samples.shape[0]
|
|
||||||
random_index = torch.randperm(sample_size).long()
|
|
||||||
|
|
||||||
positive = -(x_mu - y_samples)**2 / x_logvar.exp()
|
|
||||||
negative = - (x_mu - y_samples[random_index])**2 / x_logvar.exp()
|
|
||||||
upper_bound = (positive.sum(dim = -1) - negative.sum(dim = -1)).mean()
|
|
||||||
return upper_bound/2.
|
|
||||||
|
|
||||||
|
|
||||||
class TransitionModel(nn.Module):
|
class TransitionModel(nn.Module):
|
||||||
def __init__(self, state_size, hidden_size, action_size, history_size):
|
def __init__(self, state_size, hidden_size, action_size, history_size):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -225,11 +126,13 @@ class TransitionModel(nn.Module):
|
|||||||
self.act_fn = nn.ELU()
|
self.act_fn = nn.ELU()
|
||||||
|
|
||||||
self.fc_state_action = nn.Linear(state_size + action_size, hidden_size)
|
self.fc_state_action = nn.Linear(state_size + action_size, hidden_size)
|
||||||
self.fc_hidden = nn.Linear(hidden_size, hidden_size)
|
|
||||||
self.history_cell = nn.GRUCell(hidden_size, history_size)
|
self.history_cell = nn.GRUCell(hidden_size, history_size)
|
||||||
self.fc_state_mu = nn.Linear(history_size + hidden_size, state_size)
|
self.fc_state_mu = nn.Linear(history_size + hidden_size, state_size)
|
||||||
self.fc_state_sigma = nn.Linear(history_size + hidden_size, state_size)
|
self.fc_state_sigma = nn.Linear(history_size + hidden_size, state_size)
|
||||||
|
|
||||||
|
self.batch_norm = nn.BatchNorm1d(hidden_size)
|
||||||
|
self.batch_norm2 = nn.BatchNorm1d(state_size)
|
||||||
|
|
||||||
self.min_sigma = 1e-4
|
self.min_sigma = 1e-4
|
||||||
self.max_sigma = 1e0
|
self.max_sigma = 1e0
|
||||||
|
|
||||||
@ -240,6 +143,7 @@ class TransitionModel(nn.Module):
|
|||||||
|
|
||||||
def get_dist(self, mean, std):
|
def get_dist(self, mean, std):
|
||||||
distribution = torch.distributions.Normal(mean, std)
|
distribution = torch.distributions.Normal(mean, std)
|
||||||
|
distribution = torch.distributions.independent.Independent(distribution, 1)
|
||||||
return distribution
|
return distribution
|
||||||
|
|
||||||
def stack_states(self, states, dim=0):
|
def stack_states(self, states, dim=0):
|
||||||
@ -257,21 +161,29 @@ class TransitionModel(nn.Module):
|
|||||||
return dict(
|
return dict(
|
||||||
sample = torch.reshape(state[name], (state[name].shape[0]* state[name].shape[1], *state[name].shape[2:])))
|
sample = torch.reshape(state[name], (state[name].shape[0]* state[name].shape[1], *state[name].shape[2:])))
|
||||||
|
|
||||||
def transition_step(self, state, action, hist, not_done):
|
def transition_step(self, prev_state, prev_action, prev_hist, prev_not_done):
|
||||||
state = state * not_done
|
prev_state = prev_state.detach() * prev_not_done
|
||||||
hist = hist * not_done
|
prev_hist = prev_hist * prev_not_done
|
||||||
|
|
||||||
state_action_enc = self.act_fn(self.fc_state_action(torch.cat([state, action], dim=-1)))
|
state_action_enc = self.fc_state_action(torch.cat([prev_state, prev_action], dim=-1))
|
||||||
state_action_enc = self.act_fn(self.fc_hidden(state_action_enc))
|
state_action_enc = self.act_fn(self.batch_norm(state_action_enc))
|
||||||
state_action_enc = self.act_fn(self.fc_hidden(state_action_enc))
|
|
||||||
state_action_enc = self.act_fn(self.fc_hidden(state_action_enc))
|
|
||||||
|
|
||||||
current_hist = self.history_cell(state_action_enc, hist)
|
current_hist = self.history_cell(state_action_enc, prev_hist)
|
||||||
next_state_mu = self.act_fn(self.fc_state_mu(torch.cat([state_action_enc, hist], dim=-1)))
|
state_mu = self.act_fn(self.fc_state_mu(torch.cat([state_action_enc, prev_hist], dim=-1)))
|
||||||
next_state_sigma = torch.tanh(self.fc_state_sigma(torch.cat([state_action_enc, hist], dim=-1)))
|
state_sigma = F.softplus(self.fc_state_sigma(torch.cat([state_action_enc, prev_hist], dim=-1)))
|
||||||
next_state = next_state_mu + torch.randn_like(next_state_mu) * next_state_sigma.exp()
|
sample_state = state_mu + torch.randn_like(state_mu) * state_sigma
|
||||||
|
|
||||||
state_enc = {"mean": next_state_mu, "logvar": next_state_sigma, "sample": next_state, "history": current_hist}
|
state_enc = {"mean": state_mu, "std": state_sigma, "sample": sample_state, "history": current_hist}
|
||||||
|
return state_enc
|
||||||
|
|
||||||
|
def observe_step(self, prev_state, prev_action, prev_history):
|
||||||
|
state_action_enc = self.act_fn(self.batch_norm(self.fc_state_action(torch.cat([prev_state, prev_action], dim=-1))))
|
||||||
|
current_history = self.history_cell(state_action_enc, prev_history)
|
||||||
|
state_mu = self.act_fn(self.batch_norm2(self.fc_state_mu(torch.cat([state_action_enc, prev_history], dim=-1))))
|
||||||
|
state_sigma = F.softplus(self.fc_state_sigma(torch.cat([state_action_enc, prev_history], dim=-1)))
|
||||||
|
|
||||||
|
sample_state = state_mu + torch.randn_like(state_mu) * state_sigma
|
||||||
|
state_enc = {"mean": state_mu, "std": state_sigma, "sample": sample_state, "history": current_history}
|
||||||
return state_enc
|
return state_enc
|
||||||
|
|
||||||
def observe_rollout(self, rollout_states, rollout_actions, init_history, nonterms):
|
def observe_rollout(self, rollout_states, rollout_actions, init_history, nonterms):
|
||||||
@ -286,13 +198,11 @@ class TransitionModel(nn.Module):
|
|||||||
observed_rollout = self.stack_states(observed_rollout, dim=0)
|
observed_rollout = self.stack_states(observed_rollout, dim=0)
|
||||||
return observed_rollout
|
return observed_rollout
|
||||||
|
|
||||||
def forward(self, state, action, hist, not_done):
|
def reparemeterize(self, mean, std):
|
||||||
return self.transition_step(state, action, hist, not_done)
|
|
||||||
|
|
||||||
def reparameterize(self, mean, std):
|
|
||||||
eps = torch.randn_like(mean)
|
eps = torch.randn_like(mean)
|
||||||
return mean + eps * std
|
return mean + eps * std
|
||||||
|
|
||||||
|
|
||||||
def club_loss(x_samples, x_mu, x_logvar, y_samples):
|
def club_loss(x_samples, x_mu, x_logvar, y_samples):
|
||||||
sample_size = x_samples.shape[0]
|
sample_size = x_samples.shape[0]
|
||||||
random_index = torch.randperm(sample_size).long()
|
random_index = torch.randperm(sample_size).long()
|
||||||
@ -300,4 +210,15 @@ def club_loss(x_samples, x_mu, x_logvar, y_samples):
|
|||||||
positive = -(x_mu - y_samples)**2 / x_logvar.exp()
|
positive = -(x_mu - y_samples)**2 / x_logvar.exp()
|
||||||
negative = - (x_mu - y_samples[random_index])**2 / x_logvar.exp()
|
negative = - (x_mu - y_samples[random_index])**2 / x_logvar.exp()
|
||||||
upper_bound = (positive.sum(dim = -1) - negative.sum(dim = -1)).mean()
|
upper_bound = (positive.sum(dim = -1) - negative.sum(dim = -1)).mean()
|
||||||
return upper_bound/2.0
|
return upper_bound/2.
|
||||||
|
|
||||||
|
_AVAILABLE_ENCODERS = {'pixel': PixelEncoder, 'identity': IdentityEncoder}
|
||||||
|
|
||||||
|
|
||||||
|
def make_encoder(
|
||||||
|
encoder_type, obs_shape, feature_dim, num_layers, num_filters
|
||||||
|
):
|
||||||
|
assert encoder_type in _AVAILABLE_ENCODERS
|
||||||
|
return _AVAILABLE_ENCODERS[encoder_type](
|
||||||
|
obs_shape, feature_dim, num_layers, num_filters
|
||||||
|
)
|
||||||
|
@ -1,86 +0,0 @@
|
|||||||
import os
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import seaborn as sns
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
|
|
||||||
|
|
||||||
"""
|
|
||||||
def tabulate_events(dpath):
|
|
||||||
files = os.listdir(dpath)[0]
|
|
||||||
summary_iterators = [EventAccumulator(os.path.join(dpath, files)).Reload()]
|
|
||||||
|
|
||||||
tags = summary_iterators[0].Tags()['scalars']
|
|
||||||
|
|
||||||
for it in summary_iterators:
|
|
||||||
assert it.Tags()['scalars'] == tags
|
|
||||||
|
|
||||||
out = {t: [] for t in tags}
|
|
||||||
steps = []
|
|
||||||
|
|
||||||
for tag in tags:
|
|
||||||
steps = [e.step for e in summary_iterators[0].Scalars(tag)]
|
|
||||||
|
|
||||||
for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
|
|
||||||
assert len(set(e.step for e in events)) == 1
|
|
||||||
|
|
||||||
out[tag].append([e.value for e in events])
|
|
||||||
|
|
||||||
return out, steps
|
|
||||||
|
|
||||||
events, steps = tabulate_events('/home/vedant/pytorch_sac_ae/log/runs')
|
|
||||||
|
|
||||||
data = []
|
|
||||||
|
|
||||||
for tag, values in events.items():
|
|
||||||
for run_idx, run_values in enumerate(values):
|
|
||||||
for step_idx, value in enumerate(run_values):
|
|
||||||
data.append({
|
|
||||||
'tag': tag,
|
|
||||||
'run': run_idx,
|
|
||||||
'step': steps[step_idx],
|
|
||||||
'value': value,
|
|
||||||
})
|
|
||||||
|
|
||||||
df = pd.DataFrame(data)
|
|
||||||
print(df.head())
|
|
||||||
exit()
|
|
||||||
|
|
||||||
plt.figure(figsize=(10,6))
|
|
||||||
sns.lineplot(data=df, x='step', y='value', hue='tag', ci='sd')
|
|
||||||
plt.show()
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
from tensorboard.backend.event_processing import event_accumulator
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def data_from_tb(files):
|
|
||||||
all_steps, all_rewards = [], []
|
|
||||||
for file in files:
|
|
||||||
ea = event_accumulator.EventAccumulator(file, size_guidance={'scalars': 0})
|
|
||||||
ea.Reload()
|
|
||||||
|
|
||||||
episode_rewards = ea.Scalars('train/episode_reward')
|
|
||||||
steps = [event.step for event in episode_rewards][:990000]
|
|
||||||
rewards = [event.value for event in episode_rewards][:990000]
|
|
||||||
all_steps.append(steps)
|
|
||||||
all_rewards.append(rewards)
|
|
||||||
return all_steps, all_rewards
|
|
||||||
|
|
||||||
|
|
||||||
files = ['/home/vedant/pytorch_sac_ae/log/runs/tb_21_05_2023-13_19_36/events.out.tfevents.1684667976.cpswkstn6-nvidia4090.1749060.0',
|
|
||||||
'/home/vedant/pytorch_sac_ae/log/runs/tb_22_05_2023-09_56_30/events.out.tfevents.1684742190.cpswkstn6-nvidia4090.1976229.0']
|
|
||||||
|
|
||||||
all_steps, all_rewards = data_from_tb(files)
|
|
||||||
mean_rewards = np.mean(all_rewards, axis=0)
|
|
||||||
std_rewards = np.std(all_rewards, axis=0)
|
|
||||||
mean_steps = np.mean(all_steps, axis=0)
|
|
||||||
|
|
||||||
df = pd.DataFrame({'Steps': mean_steps,'Rewards': mean_rewards,'Standard Deviation': std_rewards})
|
|
||||||
|
|
||||||
sns.relplot(x='Steps', y='Rewards', kind='line', data=df, ci="sd")
|
|
||||||
plt.fill_between(df['Steps'], df['Rewards'] - df['Standard Deviation'], df['Rewards'] + df['Standard Deviation'], color='b', alpha=.1)
|
|
||||||
plt.title("Mean Rewards vs Steps with Standard Deviation")
|
|
||||||
plt.show()
|
|
158
sac_ae.py
158
sac_ae.py
@ -70,10 +70,8 @@ class Actor(nn.Module):
|
|||||||
self.outputs = dict()
|
self.outputs = dict()
|
||||||
self.apply(weight_init)
|
self.apply(weight_init)
|
||||||
|
|
||||||
def forward(
|
def forward(self, obs, compute_pi=True, compute_log_pi=True, detach_encoder=False):
|
||||||
self, obs, compute_pi=True, compute_log_pi=True, detach_encoder=False
|
obs, _, _ = self.encoder(obs, detach=detach_encoder)
|
||||||
):
|
|
||||||
obs,_,_ = self.encoder(obs, detach=detach_encoder)
|
|
||||||
|
|
||||||
mu, log_std = self.trunk(obs).chunk(2, dim=-1)
|
mu, log_std = self.trunk(obs).chunk(2, dim=-1)
|
||||||
|
|
||||||
@ -100,7 +98,6 @@ class Actor(nn.Module):
|
|||||||
log_pi = None
|
log_pi = None
|
||||||
|
|
||||||
mu, pi, log_pi = squash(mu, pi, log_pi)
|
mu, pi, log_pi = squash(mu, pi, log_pi)
|
||||||
|
|
||||||
return mu, pi, log_pi, log_std
|
return mu, pi, log_pi, log_std
|
||||||
|
|
||||||
def log(self, L, step, log_freq=LOG_FREQ):
|
def log(self, L, step, log_freq=LOG_FREQ):
|
||||||
@ -159,7 +156,7 @@ class Critic(nn.Module):
|
|||||||
|
|
||||||
def forward(self, obs, action, detach_encoder=False):
|
def forward(self, obs, action, detach_encoder=False):
|
||||||
# detach_encoder allows to stop gradient propogation to encoder
|
# detach_encoder allows to stop gradient propogation to encoder
|
||||||
obs,_,_ = self.encoder(obs, detach=detach_encoder)
|
obs, _ , _ = self.encoder(obs, detach=detach_encoder)
|
||||||
|
|
||||||
q1 = self.Q1(obs, action)
|
q1 = self.Q1(obs, action)
|
||||||
q2 = self.Q2(obs, action)
|
q2 = self.Q2(obs, action)
|
||||||
@ -182,12 +179,39 @@ class Critic(nn.Module):
|
|||||||
L.log_param('train_critic/q1_fc%d' % i, self.Q1.trunk[i * 2], step)
|
L.log_param('train_critic/q1_fc%d' % i, self.Q1.trunk[i * 2], step)
|
||||||
L.log_param('train_critic/q2_fc%d' % i, self.Q2.trunk[i * 2], step)
|
L.log_param('train_critic/q2_fc%d' % i, self.Q2.trunk[i * 2], step)
|
||||||
|
|
||||||
class LBLoss(nn.Module):
|
class CURL(nn.Module):
|
||||||
def __init__(self, z_dim):
|
"""
|
||||||
super(LBLoss, self).__init__()
|
CURL
|
||||||
self.z_dim = z_dim
|
"""
|
||||||
|
|
||||||
|
def __init__(self, obs_shape, z_dim, a_dim, batch_size, critic, critic_target, output_type="continuous"):
|
||||||
|
super(CURL, self).__init__()
|
||||||
|
self.batch_size = batch_size
|
||||||
|
|
||||||
|
self.encoder = critic.encoder
|
||||||
|
|
||||||
|
self.encoder_target = critic_target.encoder
|
||||||
|
|
||||||
self.W = nn.Parameter(torch.rand(z_dim, z_dim))
|
self.W = nn.Parameter(torch.rand(z_dim, z_dim))
|
||||||
|
self.combine = nn.Linear(z_dim + a_dim, z_dim)
|
||||||
|
self.output_type = output_type
|
||||||
|
|
||||||
|
def encode(self, x, a=None, detach=False, ema=False):
|
||||||
|
"""
|
||||||
|
Encoder: z_t = e(x_t)
|
||||||
|
:param x: x_t, x y coordinates
|
||||||
|
:return: z_t, value in r2
|
||||||
|
"""
|
||||||
|
if ema:
|
||||||
|
with torch.no_grad():
|
||||||
|
z_out = self.encoder_target(x)[0]
|
||||||
|
z_out = self.combine(torch.concat((z_out,a), dim=-1))
|
||||||
|
else:
|
||||||
|
z_out = self.encoder(x)[0]
|
||||||
|
|
||||||
|
if detach:
|
||||||
|
z_out = z_out.detach()
|
||||||
|
return z_out
|
||||||
|
|
||||||
def compute_logits(self, z_a, z_pos):
|
def compute_logits(self, z_a, z_pos):
|
||||||
"""
|
"""
|
||||||
@ -202,7 +226,6 @@ class LBLoss(nn.Module):
|
|||||||
logits = logits - torch.max(logits, 1)[0][:, None]
|
logits = logits - torch.max(logits, 1)[0][:, None]
|
||||||
return logits
|
return logits
|
||||||
|
|
||||||
|
|
||||||
class SacAeAgent(object):
|
class SacAeAgent(object):
|
||||||
"""SAC+AE algorithm."""
|
"""SAC+AE algorithm."""
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -245,6 +268,12 @@ class SacAeAgent(object):
|
|||||||
self.decoder_update_freq = decoder_update_freq
|
self.decoder_update_freq = decoder_update_freq
|
||||||
self.decoder_latent_lambda = decoder_latent_lambda
|
self.decoder_latent_lambda = decoder_latent_lambda
|
||||||
|
|
||||||
|
self.transition_model = TransitionModel(
|
||||||
|
encoder_feature_dim,
|
||||||
|
hidden_dim,
|
||||||
|
action_shape[0],
|
||||||
|
encoder_feature_dim).to(device)
|
||||||
|
|
||||||
self.actor = Actor(
|
self.actor = Actor(
|
||||||
obs_shape, action_shape, hidden_dim, encoder_type,
|
obs_shape, action_shape, hidden_dim, encoder_type,
|
||||||
encoder_feature_dim, actor_log_std_min, actor_log_std_max,
|
encoder_feature_dim, actor_log_std_min, actor_log_std_max,
|
||||||
@ -261,12 +290,6 @@ class SacAeAgent(object):
|
|||||||
encoder_feature_dim, num_layers, num_filters
|
encoder_feature_dim, num_layers, num_filters
|
||||||
).to(device)
|
).to(device)
|
||||||
|
|
||||||
self.transition_model = TransitionModel(
|
|
||||||
encoder_feature_dim, hidden_dim, action_shape[0], history_size=256
|
|
||||||
).to(device)
|
|
||||||
|
|
||||||
self.lb_loss = LBLoss(encoder_feature_dim).to(device)
|
|
||||||
|
|
||||||
self.critic_target.load_state_dict(self.critic.state_dict())
|
self.critic_target.load_state_dict(self.critic.state_dict())
|
||||||
|
|
||||||
# tie encoders between actor and critic
|
# tie encoders between actor and critic
|
||||||
@ -277,6 +300,11 @@ class SacAeAgent(object):
|
|||||||
# set target entropy to -|A|
|
# set target entropy to -|A|
|
||||||
self.target_entropy = -np.prod(action_shape)
|
self.target_entropy = -np.prod(action_shape)
|
||||||
|
|
||||||
|
self.CURL = CURL(obs_shape, encoder_feature_dim, action_shape[0],
|
||||||
|
obs_shape[0], self.critic,self.critic_target, output_type='continuous').to(self.device)
|
||||||
|
|
||||||
|
self.cross_entropy_loss = nn.CrossEntropyLoss()
|
||||||
|
|
||||||
self.decoder = None
|
self.decoder = None
|
||||||
if decoder_type != 'identity':
|
if decoder_type != 'identity':
|
||||||
# create decoder
|
# create decoder
|
||||||
@ -288,10 +316,7 @@ class SacAeAgent(object):
|
|||||||
|
|
||||||
# optimizer for critic encoder for reconstruction loss
|
# optimizer for critic encoder for reconstruction loss
|
||||||
self.encoder_optimizer = torch.optim.Adam(
|
self.encoder_optimizer = torch.optim.Adam(
|
||||||
list(self.critic.encoder.parameters()) +
|
self.critic.encoder.parameters(), lr=encoder_lr
|
||||||
list(self.transition_model.parameters()), #+
|
|
||||||
#list(self.lb_loss.parameters()),
|
|
||||||
lr=encoder_lr
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# optimizer for decoder
|
# optimizer for decoder
|
||||||
@ -310,6 +335,10 @@ class SacAeAgent(object):
|
|||||||
self.critic.parameters(), lr=critic_lr, betas=(critic_beta, 0.999)
|
self.critic.parameters(), lr=critic_lr, betas=(critic_beta, 0.999)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.cpc_optimizer = torch.optim.Adam(
|
||||||
|
self.CURL.parameters(), lr=encoder_lr
|
||||||
|
)
|
||||||
|
|
||||||
self.log_alpha_optimizer = torch.optim.Adam(
|
self.log_alpha_optimizer = torch.optim.Adam(
|
||||||
[self.log_alpha], lr=alpha_lr, betas=(alpha_beta, 0.999)
|
[self.log_alpha], lr=alpha_lr, betas=(alpha_beta, 0.999)
|
||||||
)
|
)
|
||||||
@ -358,7 +387,6 @@ class SacAeAgent(object):
|
|||||||
target_Q) + F.mse_loss(current_Q2, target_Q)
|
target_Q) + F.mse_loss(current_Q2, target_Q)
|
||||||
L.log('train_critic/loss', critic_loss, step)
|
L.log('train_critic/loss', critic_loss, step)
|
||||||
|
|
||||||
|
|
||||||
# Optimize the critic
|
# Optimize the critic
|
||||||
self.critic_optimizer.zero_grad()
|
self.critic_optimizer.zero_grad()
|
||||||
critic_loss.backward()
|
critic_loss.backward()
|
||||||
@ -395,76 +423,74 @@ class SacAeAgent(object):
|
|||||||
alpha_loss.backward()
|
alpha_loss.backward()
|
||||||
self.log_alpha_optimizer.step()
|
self.log_alpha_optimizer.step()
|
||||||
|
|
||||||
def update_decoder(self, obs, target_obs, L, step, obs_list, action_list, reward_list, next_obs_list, not_done_list):
|
def update_decoder(self, last_obs, last_action, last_reward, curr_obs, last_not_done, action, reward, next_obs, not_done, target_obs, L, step):
|
||||||
with torch.no_grad():
|
h_curr, mu_h_curr, std_h_curr = self.critic.encoder(curr_obs)
|
||||||
hist = torch.zeros((target_obs.shape[0], 256)).to(self.device)
|
|
||||||
for i in range(len(obs_list)-1):
|
|
||||||
state, _, _ = self.critic.encoder(obs_list[i])
|
|
||||||
action = action_list[i]
|
|
||||||
not_done = not_done_list[i]
|
|
||||||
state_enc = self.transition_model(state, action, hist, not_done)
|
|
||||||
hist = state_enc["history"]
|
|
||||||
|
|
||||||
h, h_mu, h_logvar = self.critic.encoder(obs_list[-1])
|
|
||||||
h_clone = h.clone()
|
|
||||||
|
|
||||||
action = action_list[-1]
|
|
||||||
not_done = not_done_list[-1]
|
|
||||||
state_enc = self.transition_model(h, action, hist, not_done)
|
|
||||||
mean, std = state_enc["mean"], state_enc["logvar"].exp()
|
|
||||||
|
|
||||||
h_dist_enc = torch.distributions.Normal(h_mu, h_logvar.exp())
|
|
||||||
h_dist_pred = torch.distributions.Normal(mean, std)
|
|
||||||
enc_loss = torch.distributions.kl.kl_divergence(h_dist_enc, h_dist_pred).mean() * 1e-2
|
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
z_pos, _ , _ = self.critic_target.encoder(next_obs_list[-1])
|
h_last, _, _ = self.critic.encoder(last_obs)
|
||||||
z_out = self.critic_target.encoder.combine(torch.concat((z_pos, action), dim=-1))
|
self.transition_model.init_states(last_obs.shape[0], self.device)
|
||||||
logits = self.lb_loss.compute_logits(h, z_out)
|
curr_state = self.transition_model.transition_step(h_last, last_action, self.transition_model.prev_history, last_not_done)
|
||||||
|
|
||||||
|
hist = curr_state["history"]
|
||||||
|
next_state = self.transition_model.transition_step(h_curr, action, hist, not_done)
|
||||||
|
|
||||||
|
next_state_mu = next_state["mean"]
|
||||||
|
next_state_sigma = next_state["std"]
|
||||||
|
next_state_sample = next_state["sample"]
|
||||||
|
pred_dist = torch.distributions.Normal(next_state_mu, next_state_sigma)
|
||||||
|
|
||||||
|
h, mu_h_next, logstd_h_next = self.critic.encoder(next_obs)
|
||||||
|
std_h_next = torch.exp(logstd_h_next)
|
||||||
|
enc_dist = torch.distributions.Normal(mu_h_next, std_h_next)
|
||||||
|
enc_loss = torch.mean(torch.distributions.kl.kl_divergence(enc_dist,pred_dist)) * 0.1
|
||||||
|
|
||||||
|
z_pos = self.CURL.encode(next_obs, action.detach(), ema=True)
|
||||||
|
logits = self.CURL.compute_logits(h_curr, z_pos)
|
||||||
labels = torch.arange(logits.shape[0]).long().to(self.device)
|
labels = torch.arange(logits.shape[0]).long().to(self.device)
|
||||||
lb_loss = nn.CrossEntropyLoss()(logits, labels) * 1e-2
|
lb_loss = self.cross_entropy_loss(logits, labels) * 0.1
|
||||||
|
|
||||||
#with torch.no_grad():
|
ub_loss = club_loss(h, mu_h_next, logstd_h_next, next_state_sample) * 0.1
|
||||||
# z_pos, _ , _ = self.critic.encoder(next_obs_list[-1])
|
|
||||||
#ub_loss = club_loss(state_enc["sample"], mean, state_enc["logvar"], h) * 1e-1
|
|
||||||
|
|
||||||
if target_obs.dim() == 4:
|
if target_obs.dim() == 4:
|
||||||
# preprocess images to be in [-0.5, 0.5] range
|
# preprocess images to be in [-0.5, 0.5] range
|
||||||
target_obs = utils.preprocess_obs(target_obs)
|
target_obs = utils.preprocess_obs(target_obs)
|
||||||
rec_obs = self.decoder(h_clone)
|
|
||||||
|
rec_obs = self.decoder(h)
|
||||||
rec_loss = F.mse_loss(target_obs, rec_obs)
|
rec_loss = F.mse_loss(target_obs, rec_obs)
|
||||||
|
|
||||||
ub_loss = torch.tensor(0.0)
|
# add L2 penalty on latent representation
|
||||||
#enc_loss = torch.tensor(0.0)
|
# see https://arxiv.org/pdf/1903.12436.pdf
|
||||||
#lb_loss = torch.tensor(0.0)
|
latent_loss = (0.5 * h.pow(2).sum(1)).mean()
|
||||||
#rec_loss = torch.tensor(0.0)
|
|
||||||
loss = rec_loss + enc_loss + lb_loss + ub_loss
|
loss = rec_loss + enc_loss + lb_loss + ub_loss #self.decoder_latent_lambda * latent_loss
|
||||||
self.encoder_optimizer.zero_grad()
|
self.encoder_optimizer.zero_grad()
|
||||||
self.decoder_optimizer.zero_grad()
|
self.decoder_optimizer.zero_grad()
|
||||||
|
self.cpc_optimizer.zero_grad()
|
||||||
loss.backward()
|
loss.backward()
|
||||||
|
|
||||||
self.encoder_optimizer.step()
|
self.encoder_optimizer.step()
|
||||||
self.decoder_optimizer.step()
|
self.decoder_optimizer.step()
|
||||||
|
self.cpc_optimizer.step()
|
||||||
#enc_loss = torch.tensor(0.0)
|
|
||||||
L.log('train_ae/ae_loss', loss, step)
|
L.log('train_ae/ae_loss', loss, step)
|
||||||
L.log('train_ae/rec_loss', rec_loss, step)
|
|
||||||
L.log('train_ae/enc_loss', enc_loss, step)
|
|
||||||
L.log('train_ae/lb_loss', lb_loss, step)
|
L.log('train_ae/lb_loss', lb_loss, step)
|
||||||
L.log('train_ae/ub_loss', ub_loss, step)
|
L.log('train_ae/ub_loss', ub_loss, step)
|
||||||
|
L.log('train_ae/enc_loss', enc_loss, step)
|
||||||
|
L.log('train_ae/dec_loss', rec_loss, step)
|
||||||
|
|
||||||
self.decoder.log(L, step, log_freq=LOG_FREQ)
|
self.decoder.log(L, step, log_freq=LOG_FREQ)
|
||||||
|
|
||||||
def update(self, replay_buffer, L, step):
|
def update(self, replay_buffer, L, step):
|
||||||
obs_list, action_list, reward_list, next_obs_list, not_done_list = replay_buffer.sample()
|
last_obs, last_action, last_reward, curr_obs, last_not_done, action, reward, next_obs, not_done = replay_buffer.sample()
|
||||||
obs, action, reward, next_obs, not_done = obs_list[-1], action_list[-1], reward_list[-1], next_obs_list[-1], not_done_list[-1]
|
#obs, action, reward, next_obs, not_done = replay_buffer.sample()
|
||||||
|
|
||||||
L.log('train/batch_reward', reward.mean(), step)
|
L.log('train/batch_reward', last_reward.mean(), step)
|
||||||
|
|
||||||
self.update_critic(obs, action, reward, next_obs, not_done, L, step)
|
#self.update_critic(last_obs, last_action, last_reward, curr_obs, last_not_done, L, step)
|
||||||
|
self.update_critic(curr_obs, action, reward, next_obs, not_done, L, step)
|
||||||
|
|
||||||
if step % self.actor_update_freq == 0:
|
if step % self.actor_update_freq == 0:
|
||||||
self.update_actor_and_alpha(obs, L, step)
|
#self.update_actor_and_alpha(last_obs, L, step)
|
||||||
|
self.update_actor_and_alpha(curr_obs, L, step)
|
||||||
|
|
||||||
if step % self.critic_target_update_freq == 0:
|
if step % self.critic_target_update_freq == 0:
|
||||||
utils.soft_update_params(
|
utils.soft_update_params(
|
||||||
@ -479,7 +505,7 @@ class SacAeAgent(object):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if self.decoder is not None and step % self.decoder_update_freq == 0:
|
if self.decoder is not None and step % self.decoder_update_freq == 0:
|
||||||
self.update_decoder(obs, obs, L, step, obs_list, action_list, reward_list, next_obs_list, not_done_list)
|
self.update_decoder(last_obs, last_action, last_reward, curr_obs, last_not_done, action, reward, next_obs, not_done, next_obs, L, step)
|
||||||
|
|
||||||
def save(self, model_dir, step):
|
def save(self, model_dir, step):
|
||||||
torch.save(
|
torch.save(
|
||||||
|
84
train.py
84
train.py
@ -28,37 +28,36 @@ def parse_args():
|
|||||||
parser.add_argument('--frame_stack', default=3, type=int)
|
parser.add_argument('--frame_stack', default=3, type=int)
|
||||||
parser.add_argument('--img_source', default=None, type=str, choices=['color', 'noise', 'images', 'video', 'none'])
|
parser.add_argument('--img_source', default=None, type=str, choices=['color', 'noise', 'images', 'video', 'none'])
|
||||||
parser.add_argument('--resource_files', type=str)
|
parser.add_argument('--resource_files', type=str)
|
||||||
parser.add_argument('--resource_files_test', type=str)
|
|
||||||
parser.add_argument('--total_frames', default=10000, type=int)
|
parser.add_argument('--total_frames', default=10000, type=int)
|
||||||
# replay buffer
|
# replay buffer
|
||||||
parser.add_argument('--replay_buffer_capacity', default=100000, type=int)
|
parser.add_argument('--replay_buffer_capacity', default=1000000, type=int)
|
||||||
# train
|
# train
|
||||||
parser.add_argument('--agent', default='sac_ae', type=str)
|
parser.add_argument('--agent', default='sac_ae', type=str)
|
||||||
parser.add_argument('--init_steps', default=1000, type=int)
|
parser.add_argument('--init_steps', default=1000, type=int)
|
||||||
parser.add_argument('--num_train_steps', default=2000000, type=int)
|
parser.add_argument('--num_train_steps', default=1000000, type=int)
|
||||||
parser.add_argument('--batch_size', default=32, type=int)
|
parser.add_argument('--batch_size', default=512, type=int)
|
||||||
parser.add_argument('--hidden_dim', default=1024, type=int)
|
parser.add_argument('--hidden_dim', default=1024, type=int)
|
||||||
# eval
|
# eval
|
||||||
parser.add_argument('--eval_freq', default=10000, type=int)
|
parser.add_argument('--eval_freq', default=10000, type=int)
|
||||||
parser.add_argument('--num_eval_episodes', default=10, type=int)
|
parser.add_argument('--num_eval_episodes', default=10, type=int)
|
||||||
# critic
|
# critic
|
||||||
parser.add_argument('--critic_lr', default=1e-4, type=float)
|
parser.add_argument('--critic_lr', default=1e-3, type=float)
|
||||||
parser.add_argument('--critic_beta', default=0.9, type=float)
|
parser.add_argument('--critic_beta', default=0.9, type=float)
|
||||||
parser.add_argument('--critic_tau', default=0.01, type=float)
|
parser.add_argument('--critic_tau', default=0.01, type=float)
|
||||||
parser.add_argument('--critic_target_update_freq', default=2, type=int)
|
parser.add_argument('--critic_target_update_freq', default=2, type=int)
|
||||||
# actor
|
# actor
|
||||||
parser.add_argument('--actor_lr', default=1e-4, type=float)
|
parser.add_argument('--actor_lr', default=1e-3, type=float)
|
||||||
parser.add_argument('--actor_beta', default=0.9, type=float)
|
parser.add_argument('--actor_beta', default=0.9, type=float)
|
||||||
parser.add_argument('--actor_log_std_min', default=-10, type=float)
|
parser.add_argument('--actor_log_std_min', default=-10, type=float)
|
||||||
parser.add_argument('--actor_log_std_max', default=2, type=float)
|
parser.add_argument('--actor_log_std_max', default=2, type=float)
|
||||||
parser.add_argument('--actor_update_freq', default=2, type=int)
|
parser.add_argument('--actor_update_freq', default=2, type=int)
|
||||||
# encoder/decoder
|
# encoder/decoder
|
||||||
parser.add_argument('--encoder_type', default='pixel', type=str)
|
parser.add_argument('--encoder_type', default='pixel', type=str)
|
||||||
parser.add_argument('--encoder_feature_dim', default=250, type=int)
|
parser.add_argument('--encoder_feature_dim', default=50, type=int)
|
||||||
parser.add_argument('--encoder_lr', default=1e-4, type=float)
|
parser.add_argument('--encoder_lr', default=1e-3, type=float)
|
||||||
parser.add_argument('--encoder_tau', default=0.05, type=float)
|
parser.add_argument('--encoder_tau', default=0.05, type=float)
|
||||||
parser.add_argument('--decoder_type', default='pixel', type=str)
|
parser.add_argument('--decoder_type', default='pixel', type=str)
|
||||||
parser.add_argument('--decoder_lr', default=1e-4, type=float)
|
parser.add_argument('--decoder_lr', default=1e-3, type=float)
|
||||||
parser.add_argument('--decoder_update_freq', default=1, type=int)
|
parser.add_argument('--decoder_update_freq', default=1, type=int)
|
||||||
parser.add_argument('--decoder_latent_lambda', default=1e-6, type=float)
|
parser.add_argument('--decoder_latent_lambda', default=1e-6, type=float)
|
||||||
parser.add_argument('--decoder_weight_lambda', default=1e-7, type=float)
|
parser.add_argument('--decoder_weight_lambda', default=1e-7, type=float)
|
||||||
@ -154,25 +153,9 @@ def main():
|
|||||||
)
|
)
|
||||||
env.seed(args.seed)
|
env.seed(args.seed)
|
||||||
|
|
||||||
env_test = dmc2gym.make(
|
|
||||||
domain_name=args.domain_name,
|
|
||||||
task_name=args.task_name,
|
|
||||||
seed=args.seed,
|
|
||||||
visualize_reward=False,
|
|
||||||
from_pixels=(args.encoder_type == 'pixel'),
|
|
||||||
height=args.image_size,
|
|
||||||
width=args.image_size,
|
|
||||||
frame_skip=args.action_repeat,
|
|
||||||
img_source=args.img_source,
|
|
||||||
resource_files=args.resource_files_test,
|
|
||||||
total_frames=args.total_frames
|
|
||||||
)
|
|
||||||
env_test.seed(args.seed)
|
|
||||||
|
|
||||||
# stack several consecutive frames together
|
# stack several consecutive frames together
|
||||||
if args.encoder_type == 'pixel':
|
if args.encoder_type == 'pixel':
|
||||||
env = utils.FrameStack(env, k=args.frame_stack)
|
env = utils.FrameStack(env, k=args.frame_stack)
|
||||||
env_test = utils.FrameStack(env_test, k=args.frame_stack)
|
|
||||||
|
|
||||||
utils.make_dir(args.work_dir)
|
utils.make_dir(args.work_dir)
|
||||||
video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
|
video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
|
||||||
@ -219,7 +202,7 @@ def main():
|
|||||||
# evaluate agent periodically
|
# evaluate agent periodically
|
||||||
if step % args.eval_freq == 0:
|
if step % args.eval_freq == 0:
|
||||||
L.log('eval/episode', episode, step)
|
L.log('eval/episode', episode, step)
|
||||||
evaluate(env_test, agent, video, args.num_eval_episodes, L, step)
|
evaluate(env, agent, video, args.num_eval_episodes, L, step)
|
||||||
if args.save_model:
|
if args.save_model:
|
||||||
agent.save(model_dir, step)
|
agent.save(model_dir, step)
|
||||||
if args.save_buffer:
|
if args.save_buffer:
|
||||||
@ -235,28 +218,65 @@ def main():
|
|||||||
|
|
||||||
L.log('train/episode', episode, step)
|
L.log('train/episode', episode, step)
|
||||||
|
|
||||||
|
if episode_step == 0:
|
||||||
|
last_obs = obs
|
||||||
|
# sample action for data collection
|
||||||
|
if step < args.init_steps:
|
||||||
|
last_action = env.action_space.sample()
|
||||||
|
else:
|
||||||
|
with utils.eval_mode(agent):
|
||||||
|
last_action = agent.sample_action(last_obs)
|
||||||
|
|
||||||
|
curr_obs, last_reward, last_done, _ = env.step(last_action)
|
||||||
|
|
||||||
|
# allow infinit bootstrap
|
||||||
|
last_done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(last_done)
|
||||||
|
episode_reward += last_reward
|
||||||
|
|
||||||
# sample action for data collection
|
# sample action for data collection
|
||||||
if step < args.init_steps:
|
if step < args.init_steps:
|
||||||
action = env.action_space.sample()
|
action = env.action_space.sample()
|
||||||
else:
|
else:
|
||||||
with utils.eval_mode(agent):
|
with utils.eval_mode(agent):
|
||||||
action = agent.sample_action(obs)
|
action = agent.sample_action(curr_obs)
|
||||||
|
|
||||||
|
next_obs, reward, done, _ = env.step(action)
|
||||||
|
|
||||||
|
# allow infinit bootstrap
|
||||||
|
done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
|
||||||
|
episode_reward += reward
|
||||||
|
|
||||||
|
replay_buffer.add(last_obs, last_action, last_reward, curr_obs, last_done_bool, action, reward, next_obs, done_bool)
|
||||||
|
|
||||||
|
last_obs = curr_obs
|
||||||
|
last_action = action
|
||||||
|
last_reward = reward
|
||||||
|
last_done = done
|
||||||
|
curr_obs = next_obs
|
||||||
|
|
||||||
|
# sample action for data collection
|
||||||
|
if step < args.init_steps:
|
||||||
|
action = env.action_space.sample()
|
||||||
|
else:
|
||||||
|
with utils.eval_mode(agent):
|
||||||
|
action = agent.sample_action(curr_obs)
|
||||||
|
|
||||||
|
|
||||||
# run training update
|
# run training update
|
||||||
if step >= args.init_steps:
|
if step >= args.init_steps:
|
||||||
num_updates = args.init_steps if step == args.init_steps else 1
|
#num_updates = args.init_steps if step == args.init_steps else 1
|
||||||
|
num_updates = 1 if step == args.init_steps else 1
|
||||||
for _ in range(num_updates):
|
for _ in range(num_updates):
|
||||||
agent.update(replay_buffer, L, step)
|
agent.update(replay_buffer, L, step)
|
||||||
|
|
||||||
next_obs, reward, done, _ = env.step(action)
|
next_obs, reward, done, _ = env.step(action)
|
||||||
|
|
||||||
# allow infinit bootstrap
|
# allow infinit bootstrap
|
||||||
done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
|
done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
|
||||||
done
|
|
||||||
)
|
|
||||||
episode_reward += reward
|
episode_reward += reward
|
||||||
|
|
||||||
replay_buffer.add(obs, action, reward, next_obs, done_bool)
|
#replay_buffer.add(obs, action, reward, next_obs, done_bool)
|
||||||
|
replay_buffer.add(last_obs, last_action, last_reward, curr_obs, last_done_bool, action, reward, next_obs, done_bool)
|
||||||
|
|
||||||
obs = next_obs
|
obs = next_obs
|
||||||
episode_step += 1
|
episode_step += 1
|
||||||
|
50
utils.py
50
utils.py
@ -75,18 +75,26 @@ class ReplayBuffer(object):
|
|||||||
# the proprioceptive obs is stored as float32, pixels obs as uint8
|
# the proprioceptive obs is stored as float32, pixels obs as uint8
|
||||||
obs_dtype = np.float32 if len(obs_shape) == 1 else np.uint8
|
obs_dtype = np.float32 if len(obs_shape) == 1 else np.uint8
|
||||||
|
|
||||||
self.obses = np.empty((capacity, *obs_shape), dtype=obs_dtype)
|
self.last_obses = np.empty((capacity, *obs_shape), dtype=obs_dtype)
|
||||||
|
self.curr_obses = np.empty((capacity, *obs_shape), dtype=obs_dtype)
|
||||||
self.next_obses = np.empty((capacity, *obs_shape), dtype=obs_dtype)
|
self.next_obses = np.empty((capacity, *obs_shape), dtype=obs_dtype)
|
||||||
|
self.last_actions = np.empty((capacity, *action_shape), dtype=np.float32)
|
||||||
self.actions = np.empty((capacity, *action_shape), dtype=np.float32)
|
self.actions = np.empty((capacity, *action_shape), dtype=np.float32)
|
||||||
|
self.last_rewards = np.empty((capacity, 1), dtype=np.float32)
|
||||||
self.rewards = np.empty((capacity, 1), dtype=np.float32)
|
self.rewards = np.empty((capacity, 1), dtype=np.float32)
|
||||||
|
self.last_not_dones = np.empty((capacity, 1), dtype=np.float32)
|
||||||
self.not_dones = np.empty((capacity, 1), dtype=np.float32)
|
self.not_dones = np.empty((capacity, 1), dtype=np.float32)
|
||||||
|
|
||||||
self.idx = 0
|
self.idx = 0
|
||||||
self.last_save = 0
|
self.last_save = 0
|
||||||
self.full = False
|
self.full = False
|
||||||
|
|
||||||
def add(self, obs, action, reward, next_obs, done):
|
def add(self, last_obs, last_action, last_reward, curr_obs, last_done, action, reward, next_obs, done):
|
||||||
np.copyto(self.obses[self.idx], obs)
|
np.copyto(self.last_obses[self.idx], last_obs)
|
||||||
|
np.copyto(self.last_actions[self.idx], last_action)
|
||||||
|
np.copyto(self.last_rewards[self.idx], last_reward)
|
||||||
|
np.copyto(self.curr_obses[self.idx], curr_obs)
|
||||||
|
np.copyto(self.last_not_dones[self.idx], not last_done)
|
||||||
np.copyto(self.actions[self.idx], action)
|
np.copyto(self.actions[self.idx], action)
|
||||||
np.copyto(self.rewards[self.idx], reward)
|
np.copyto(self.rewards[self.idx], reward)
|
||||||
np.copyto(self.next_obses[self.idx], next_obs)
|
np.copyto(self.next_obses[self.idx], next_obs)
|
||||||
@ -96,29 +104,35 @@ class ReplayBuffer(object):
|
|||||||
self.full = self.full or self.idx == 0
|
self.full = self.full or self.idx == 0
|
||||||
|
|
||||||
def sample(self):
|
def sample(self):
|
||||||
begin = 2
|
|
||||||
idxs = np.random.randint(
|
idxs = np.random.randint(
|
||||||
begin, self.capacity if self.full else self.idx, size=self.batch_size
|
0, self.capacity if self.full else self.idx, size=self.batch_size
|
||||||
)
|
)
|
||||||
past_idxs = idxs - begin
|
|
||||||
|
|
||||||
obses = torch.as_tensor(np.swapaxes(np.asarray([self.obses[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device).float()
|
last_obses = torch.as_tensor(self.last_obses[idxs], device=self.device).float()
|
||||||
actions = torch.as_tensor(np.swapaxes(np.asarray([self.actions[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device)
|
last_actions = torch.as_tensor(self.last_actions[idxs], device=self.device)
|
||||||
rewards = torch.as_tensor(np.swapaxes(np.asarray([self.rewards[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device)
|
last_rewards = torch.as_tensor(self.last_rewards[idxs], device=self.device)
|
||||||
next_obses = torch.as_tensor(np.swapaxes(np.asarray([self.next_obses[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device).float()
|
curr_obses = torch.as_tensor(self.curr_obses[idxs], device=self.device).float()
|
||||||
not_dones = torch.as_tensor(np.swapaxes(np.asarray([self.not_dones[past_idxs:idxs] for past_idxs, idxs in zip(past_idxs, idxs)]),0,1), device=self.device)
|
last_not_dones = torch.as_tensor(self.last_not_dones[idxs], device=self.device)
|
||||||
|
actions = torch.as_tensor(self.actions[idxs], device=self.device)
|
||||||
|
rewards = torch.as_tensor(self.rewards[idxs], device=self.device)
|
||||||
|
next_obses = torch.as_tensor(self.next_obses[idxs], device=self.device).float()
|
||||||
|
not_dones = torch.as_tensor(self.not_dones[idxs], device=self.device)
|
||||||
|
|
||||||
return obses, actions, rewards, next_obses, not_dones
|
return last_obses, last_actions, last_rewards, curr_obses, last_not_dones, actions, rewards, next_obses, not_dones
|
||||||
|
|
||||||
def save(self, save_dir):
|
def save(self, save_dir):
|
||||||
if self.idx == self.last_save:
|
if self.idx == self.last_save:
|
||||||
return
|
return
|
||||||
path = os.path.join(save_dir, '%d_%d.pt' % (self.last_save, self.idx))
|
path = os.path.join(save_dir, '%d_%d.pt' % (self.last_save, self.idx))
|
||||||
payload = [
|
payload = [
|
||||||
self.obses[self.last_save:self.idx],
|
self.last_obses[self.last_save:self.idx],
|
||||||
self.next_obses[self.last_save:self.idx],
|
self.last_actions[self.last_save:self.idx],
|
||||||
|
self.last_rewards[self.last_save:self.idx],
|
||||||
|
self.curr_obses[self.last_save:self.idx],
|
||||||
|
self.last_not_dones[self.last_save:self.idx],
|
||||||
self.actions[self.last_save:self.idx],
|
self.actions[self.last_save:self.idx],
|
||||||
self.rewards[self.last_save:self.idx],
|
self.rewards[self.last_save:self.idx],
|
||||||
|
self.next_obses[self.last_save:self.idx],
|
||||||
self.not_dones[self.last_save:self.idx]
|
self.not_dones[self.last_save:self.idx]
|
||||||
]
|
]
|
||||||
self.last_save = self.idx
|
self.last_save = self.idx
|
||||||
@ -132,10 +146,14 @@ class ReplayBuffer(object):
|
|||||||
path = os.path.join(save_dir, chunk)
|
path = os.path.join(save_dir, chunk)
|
||||||
payload = torch.load(path)
|
payload = torch.load(path)
|
||||||
assert self.idx == start
|
assert self.idx == start
|
||||||
self.obses[start:end] = payload[0]
|
self.last_obses[start:end] = payload[0]
|
||||||
self.next_obses[start:end] = payload[1]
|
self.last_actions[start:end] = payload[1]
|
||||||
|
self.last_rewards[start:end] = payload[2]
|
||||||
|
self.curr_obses[start:end] = payload[3]
|
||||||
|
self.last_not_dones[start:end] = payload[4]
|
||||||
self.actions[start:end] = payload[2]
|
self.actions[start:end] = payload[2]
|
||||||
self.rewards[start:end] = payload[3]
|
self.rewards[start:end] = payload[3]
|
||||||
|
self.next_obses[start:end] = payload[4]
|
||||||
self.not_dones[start:end] = payload[4]
|
self.not_dones[start:end] = payload[4]
|
||||||
self.idx = end
|
self.idx = end
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user