# ActiveBOToytask/runner/BODmRunner.py
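"""Episodic Bayesian optimization of policy weights on the dm_control
'finger' / 'turn_easy' task.

Each BO iteration rolls out a weight-parameterized policy for up to
``nr_steps`` environment steps, records the reward, and finally plots the
mean best reward across runs with a ±1.96·std band.
"""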
# Control Suite
from dm_control import suite
# General
import copy
import numpy as np
# Graphics-related
import matplotlib.pyplot as plt
# Bayesian Optimization
from BayesianOptimization.BOwithDM import BayesianOptimization
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
seed = None
random_state = np.random.RandomState(seed=seed)
env = suite.load('finger', 'turn_easy', task_kwargs={'random': random_state})
spec = env.action_spec()
print(spec)
time_step = env.reset()
nr_steps = 100
nr_runs = 1
nr_dims = spec.shape[0]
iteration_steps = 10
acquisition_fun = "Expected Improvement"
nr_weights = 15
nr_inits = 3
# storage arrays
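# best_weights: reserved for the best policy weights of each run (not written in this script)
# best_rewards[run, n]: best reward found up to BO iteration n of a run
# rewards[run, n]: reward obtained at BO iteration n of a run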
best_weights = np.zeros((nr_runs, nr_weights, nr_dims))
best_rewards = np.zeros((nr_runs, iteration_steps))
rewards = np.zeros((nr_runs, iteration_steps))

def post_processing(reward):
    reward_mean = np.mean(reward, axis=0)
    reward_std = np.std(reward, axis=0)
    return reward_mean, reward_std

def plot_reward(mean, std):
    eps = np.linspace(0, mean.shape[0], mean.shape[0])
    plt.plot(eps, mean)
    # Shade a ±1.96·std band (≈95% interval) around the mean reward.
    plt.fill_between(
        eps,
        mean - 1.96 * std,
        mean + 1.96 * std,
        alpha=0.5
    )
    plt.show()

def runner(env_, policy_):
    reward = 0
    env_.reset()
    for step in range(nr_steps):
        action = policy_[step]
        output = env_.step(action)
        print(output.reward)
        if output.reward != 0:
            # First non-zero environment reward: scale it up and end the episode early.
            reward += output.reward * 10
            break
        # Constant step penalty while no reward has been collected.
        reward += -1.0
    return reward

def main():
    global best_weights, rewards
    bo = BayesianOptimization(nr_steps, nr_dims, nr_weights, acq=acquisition_fun, seed=seed)

    for run in range(nr_runs):
        bo.reset_bo()
        print(f'Run: {run}')

        # initialization
        for init in range(nr_inits):
            bo.policy_model.random_weights()
            policy = bo.policy_model.rollout()
            reward = runner(env, policy)

            x = bo.policy_model.get_x()
            x = x.reshape(nr_weights * nr_dims, )
            bo.add_observation(reward, x)

        # Bayesian Optimization
        for n in range(iteration_steps):
            x_next = bo.next_observation()
            bo.policy_model.set_weights(x_next)
            policy = bo.policy_model.rollout()
            reward = runner(env, policy)

            x_next = x_next.reshape(nr_weights * nr_dims, )
            bo.add_observation(reward, x_next)
            rewards[run, n] = reward

            y_max, _, _ = bo.get_best_result()
            best_rewards[run, n] = y_max

    reward_mean, reward_std = post_processing(best_rewards)
    plot_reward(reward_mean, reward_std)

if __name__ == '__main__':
    main()
# for i in range(nr_steps):
#     action = random_state.uniform(spec.minimum, spec.maximum, spec.shape)
#     time_step = env.step(action)
#
#     camera0 = env.physics.render(camera_id=0, height=400, width=600)
#     frames.append(camera0)  # Directly append the frame without any modification
#     rewards.append(time_step.reward)
#     observations.append(copy.deepcopy(time_step.observation))
#     ticks.append(env.physics.data.time)
#
# # Show video and plot reward and observations
# for i in range(len(frames)):
#     if i % 20 == 0:  # Display every 20th frame for example purposes
#         print(frames[i].shape)
#         fig, ax = plt.subplots(1, 1)
#         ax.imshow(frames[i])
#         ax.axis('off')  # Turn off the axis
#
#         # Remove any whitespace from the edges
#         ax.set_xticks([])
#         ax.set_yticks([])
#         plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)
#         plt.margins(0, 0)
#         plt.gca().xaxis.set_major_locator(plt.NullLocator())
#         plt.gca().yaxis.set_major_locator(plt.NullLocator())
#
#         plt.show()