# Control Suite
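# Overview (added): episodic policy search on the dm_control 'reacher, hard' task.
# A weight-parameterized policy is rolled out in the environment and its episode
# return is maximized with Bayesian Optimization (Expected Improvement acquisition).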
from dm_control import suite

# General
import copy  # used only by the commented-out visualization snippet at the bottom
import numpy as np

# Graphics-related
import matplotlib.pyplot as plt

# Bayesian Optimization
from BayesianOptimization.BOwithDM import BayesianOptimization

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn ConvergenceWarnings (presumably raised while fitting the
# optimizer's surrogate model) to keep the run log readable.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

seed = None
random_state = np.random.RandomState(seed=seed)
env = suite.load('reacher', 'hard', task_kwargs={'random': random_state})
spec = env.action_spec()
time_step = env.reset()
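# NOTE (added): seed = None draws a fresh RandomState on every execution; set an
# integer to make the environment (and presumably the optimizer, which is passed
# the same seed below) reproducible.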

nr_steps = 100                             # environment steps per policy rollout
nr_runs = 10                               # independent repetitions of the experiment
nr_dims = spec.shape[0]                    # action dimensionality of the reacher
iteration_steps = 50                       # Bayesian Optimization iterations per run
acquisition_fun = "Expected Improvement"

nr_weights = 15                            # policy weights per action dimension
nr_inits = 3                               # random rollouts used to seed the optimizer

# storage arrays
best_weights = np.zeros((nr_runs, nr_weights, nr_dims))
best_rewards = np.zeros((nr_runs, iteration_steps))
rewards = np.zeros((nr_runs, iteration_steps))
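# NOTE (added): best_weights is allocated here but never written to in main() below.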


def post_processing(reward):
    # mean and spread of the learning curves across runs (axis 0 = runs)
    reward_mean = np.mean(reward, axis=0)
    reward_std = np.std(reward, axis=0)

    return reward_mean, reward_std


def plot_reward(mean, std):
    eps = np.arange(mean.shape[0])  # iteration index for the x-axis
    plt.plot(eps, mean)

    # shade an approximate 95% band (mean +/- 1.96 std across runs)
    plt.fill_between(
        eps,
        mean - 1.96 * std,
        mean + 1.96 * std,
        alpha=0.5
    )
    plt.show()
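
# Example (added, illustrative only): the two helpers above compose as
#   curve_mean, curve_std = post_processing(best_rewards)
#   plot_reward(curve_mean, curve_std)
# which is exactly how main() uses them once all runs have finished.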


def runner(env_, policy_):
    reward = 0
    env_.reset()
    for step in range(nr_steps):
        action = policy_[step]
        output = env_.step(action)

        if output.reward != 0:
            # sparse task reward reached: add a scaled bonus and stop the rollout
            reward += output.reward * 10
            break

        # per-step penalty while the target has not been reached
        reward += -1.0

    return reward
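
# NOTE (added): with the per-step penalty reconstructed inside the loop above, a
# rollout that never triggers the sparse reward scores -nr_steps (-100 here), while
# reaching the target at step k scores 10 * output.reward - k.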


def main():
    global best_weights, rewards
    bo = BayesianOptimization(nr_steps, nr_dims, nr_weights, acq=acquisition_fun, seed=seed)
    for run in range(nr_runs):
        bo.reset_bo()
        print(f'Run: {run}')
        # initialization: seed the optimizer with a few random policies
        for init in range(nr_inits):
            bo.policy_model.random_weights()
            policy = bo.policy_model.rollout()

            reward = runner(env, policy)
            x = bo.policy_model.get_x()
            x = x.reshape(nr_weights * nr_dims, )

            bo.add_observation(reward, x)

        # Bayesian Optimization: propose weights, roll the policy out, record the result
        for n in range(iteration_steps):
            x_next = bo.next_observation()

            bo.policy_model.set_weights(x_next)
            policy = bo.policy_model.rollout()

            reward = runner(env, policy)
            x_next = x_next.reshape(nr_weights * nr_dims, )

            bo.add_observation(reward, x_next)
            rewards[run, n] = reward

            y_max, _, _ = bo.get_best_result()
            best_rewards[run, n] = y_max

    reward_mean, reward_std = post_processing(best_rewards)
    plot_reward(reward_mean, reward_std)
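    # NOTE (added): best_rewards[run, n] stores y_max from bo.get_best_result(),
    # presumably the best reward seen so far in that run; the final plot shows its
    # mean across runs with a +/-1.96 std band.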


if __name__ == '__main__':
    main()
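
# NOTE (added): the commented-out snippet below is a leftover dm_control visualization
# sketch. It assumes `frames`, `observations`, and `ticks` lists that are never defined
# in this file (and `rewards` here is an ndarray, not a list), so it stays disabled.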

# for i in range(nr_steps):
#     action = random_state.uniform(spec.minimum, spec.maximum, spec.shape)
#     time_step = env.step(action)
#
#     camera0 = env.physics.render(camera_id=0, height=400, width=600)
#     frames.append(camera0)  # Directly append the frame without any modification
#     rewards.append(time_step.reward)
#     observations.append(copy.deepcopy(time_step.observation))
#     ticks.append(env.physics.data.time)
#
# # Show video and plot reward and observations
# for i in range(len(frames)):
#     if i % 20 == 0:  # Display every 20th frame for example purposes
#         print(frames[i].shape)
#         fig, ax = plt.subplots(1, 1)
#         ax.imshow(frames[i])
#         ax.axis('off')  # Turn off the axis
#
#         # Remove any whitespace from the edges
#         ax.set_xticks([])
#         ax.set_yticks([])
#         plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)
#         plt.margins(0, 0)
#         plt.gca().xaxis.set_major_locator(plt.NullLocator())
#         plt.gca().yaxis.set_major_locator(plt.NullLocator())
#
#         plt.show()