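"""Bayesian optimization of a weight-parameterized policy on the dm_control
suite's 'finger' / 'turn_easy' task.

A BayesianOptimization model (BOwithDM) is seeded with a few random policy
rollouts, then iteratively proposes new weight vectors, evaluates them in the
environment via `runner`, and finally plots the mean best reward with a
+/-1.96 std band across runs.
"""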
# Control Suite
from dm_control import suite

# General
import copy
import numpy as np

# Graphics-related
import matplotlib.pyplot as plt

# Bayesian Optimization
from BayesianOptimization.BOwithDM import BayesianOptimization

import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Environment setup: the dm_control 'finger' domain, 'turn_easy' task.
seed = None
random_state = np.random.RandomState(seed=seed)
env = suite.load('finger', 'turn_easy', task_kwargs={'random': random_state})
spec = env.action_spec()
print(spec)
time_step = env.reset()

# Experiment configuration
nr_steps = 100                            # environment steps per rollout
nr_runs = 1                               # independent optimization runs
nr_dims = spec.shape[0]                   # action dimensionality
iteration_steps = 10                      # BO iterations per run
acquisition_fun = "Expected Improvement"  # acquisition function passed to BOwithDM

nr_weights = 15                           # policy weights per action dimension
nr_inits = 3                              # random rollouts used to initialize the optimizer

# storage arrays
best_weights = np.zeros((nr_runs, nr_weights, nr_dims))  # allocated, not yet filled below
best_rewards = np.zeros((nr_runs, iteration_steps))      # best reward found so far, per iteration
rewards = np.zeros((nr_runs, iteration_steps))           # reward of each evaluated policy


def post_processing(reward):
    """Aggregate per-run rewards into a mean and standard deviation across runs."""
    reward_mean = np.mean(reward, axis=0)
    reward_std = np.std(reward, axis=0)

    return reward_mean, reward_std


def plot_reward(mean, std):
    """Plot the mean reward per BO iteration with a +/-1.96 std band."""
    eps = np.arange(mean.shape[0])  # iteration index on the x-axis
    plt.plot(eps, mean)

    plt.fill_between(
        eps,
        mean - 1.96 * std,
        mean + 1.96 * std,
        alpha=0.5
    )
    plt.show()


def runner(env_, policy_):
    """Roll out `policy_` open-loop for up to `nr_steps` and return a scalar score.

    Every step without task reward costs -1; the first step with non-zero
    reward adds 10x that reward and terminates the rollout early.
    """
    reward = 0
    env_.reset()
    for step in range(nr_steps):
        action = policy_[step]
        output = env_.step(action)
        print(output.reward)

        if output.reward != 0:
            reward += output.reward * 10
            break

        reward += -1.0

    return reward


def main():
    """Run the Bayesian-optimization experiment and plot the aggregated results."""
    # The arrays are written in place, so `global` is kept only for readability.
    global best_rewards, rewards
    bo = BayesianOptimization(nr_steps, nr_dims, nr_weights, acq=acquisition_fun, seed=seed)
    for run in range(nr_runs):
        bo.reset_bo()
        print(f'Run: {run}')

        # Initialization: evaluate a few random policies to seed the optimizer.
        for init in range(nr_inits):
            bo.policy_model.random_weights()
            policy = bo.policy_model.rollout()

            reward = runner(env, policy)
            x = bo.policy_model.get_x()
            x = x.reshape(nr_weights * nr_dims, )

            bo.add_observation(reward, x)

        # Bayesian optimization: propose weights, roll them out, record the result.
        for n in range(iteration_steps):
            x_next = bo.next_observation()

            bo.policy_model.set_weights(x_next)
            policy = bo.policy_model.rollout()

            reward = runner(env, policy)
            x_next = x_next.reshape(nr_weights * nr_dims, )

            bo.add_observation(reward, x_next)
            rewards[run, n] = reward

            y_max, _, _ = bo.get_best_result()
            best_rewards[run, n] = y_max

    reward_mean, reward_std = post_processing(best_rewards)
    plot_reward(reward_mean, reward_std)


if __name__ == '__main__':
    main()


# for i in range(nr_steps):
#     action = random_state.uniform(spec.minimum, spec.maximum, spec.shape)
#     time_step = env.step(action)
#
#     camera0 = env.physics.render(camera_id=0, height=400, width=600)
#     frames.append(camera0)  # Directly append the frame without any modification
#     rewards.append(time_step.reward)
#     observations.append(copy.deepcopy(time_step.observation))
#     ticks.append(env.physics.data.time)
#
# # Show video and plot reward and observations
# for i in range(len(frames)):
#     if i % 20 == 0:  # Display every 20th frame for example purposes
#         print(frames[i].shape)
#         fig, ax = plt.subplots(1, 1)
#         ax.imshow(frames[i])
#         ax.axis('off')  # Turn off the axis
#
#         # Remove any whitespace from the edges
#         ax.set_xticks([])
#         ax.set_yticks([])
#         plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)
#         plt.margins(0, 0)
#         plt.gca().xaxis.set_major_locator(plt.NullLocator())
#         plt.gca().yaxis.set_major_locator(plt.NullLocator())
#
#         plt.show()
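

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original experiment): the commented-out
# scratch code above references containers (frames, observations, ticks) that
# are never defined in this script and would call .append on the NumPy array
# `rewards`. The helper below is a minimal, self-contained version of that
# idea. Its name `render_random_rollout`, its signature, and the `frame_skip`
# parameter are assumptions added here; the body only uses calls that already
# appear in this file. It is defined but never called.
def render_random_rollout(env_, n_steps=nr_steps, frame_skip=20):
    """Roll out uniformly random actions, record frames, and show a subsample."""
    spec_ = env_.action_spec()
    env_.reset()

    frames, step_rewards, observations, ticks = [], [], [], []
    for _ in range(n_steps):
        action = random_state.uniform(spec_.minimum, spec_.maximum, spec_.shape)
        ts = env_.step(action)

        frames.append(env_.physics.render(camera_id=0, height=400, width=600))
        step_rewards.append(ts.reward)
        observations.append(copy.deepcopy(ts.observation))
        ticks.append(env_.physics.data.time)

    # Display every `frame_skip`-th recorded frame.
    for i in range(0, len(frames), frame_skip):
        fig, ax = plt.subplots(1, 1)
        ax.imshow(frames[i])
        ax.axis('off')
        plt.show()

    return frames, step_rewards, observations, ticks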