# Control Suite
from dm_control import suite

# General
import copy
import numpy as np

# Graphics-related
import matplotlib.pyplot as plt

# Bayesian Optimization
from BayesianOptimization.BOwithDM import BayesianOptimization

import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Environment setup: sparse-reward 'reacher hard' task from the DeepMind Control Suite.
seed = None
random_state = np.random.RandomState(seed=seed)
env = suite.load('reacher', 'hard', task_kwargs={'random': random_state})
spec = env.action_spec()
time_step = env.reset()

# Experiment configuration
nr_steps = 100           # environment steps per rollout
nr_runs = 10             # independent optimization runs
nr_dims = spec.shape[0]  # action dimensionality
iteration_steps = 50     # BO iterations per run
acquisition_fun = "Expected Improvement"
nr_weights = 15          # policy weights per action dimension
nr_inits = 3             # random rollouts used to initialize the surrogate model

# storage arrays
best_weights = np.zeros((nr_runs, nr_weights, nr_dims))
best_rewards = np.zeros((nr_runs, iteration_steps))
rewards = np.zeros((nr_runs, iteration_steps))


def post_processing(reward):
    """Mean and standard deviation of the rewards across runs."""
    reward_mean = np.mean(reward, axis=0)
    reward_std = np.std(reward, axis=0)
    return reward_mean, reward_std


def plot_reward(mean, std):
    """Plot the mean best reward per BO iteration with a 95% confidence band."""
    eps = np.arange(mean.shape[0])  # iteration index on the x-axis
    plt.plot(eps, mean)
    plt.fill_between(
        eps,
        mean - 1.96 * std,
        mean + 1.96 * std,
        alpha=0.5
    )
    plt.show()


def runner(env_, policy_):
    """Roll out an open-loop policy and return its episode return."""
    reward = 0
    env_.reset()
    for step in range(nr_steps):
        action = policy_[step]
        output = env_.step(action)
        if output.reward != 0:
            # Sparse success signal: scale it up and end the episode early.
            reward += output.reward * 10
            break
        reward += -1.0  # per-step penalty while the target has not been reached
    return reward


def main():
    global best_weights, rewards
    bo = BayesianOptimization(nr_steps, nr_dims, nr_weights,
                              acq=acquisition_fun, seed=seed)

    for run in range(nr_runs):
        bo.reset_bo()
        print(f'Run: {run}')

        # Initialization: seed the surrogate model with random policies.
        for init in range(nr_inits):
            bo.policy_model.random_weights()
            policy = bo.policy_model.rollout()
            reward = runner(env, policy)

            x = bo.policy_model.get_x()
            x = x.reshape(nr_weights * nr_dims, )
            bo.add_observation(reward, x)

        # Bayesian Optimization loop: query the acquisition function,
        # evaluate the proposed policy, and feed the result back.
        for n in range(iteration_steps):
            x_next = bo.next_observation()
            bo.policy_model.set_weights(x_next)
            policy = bo.policy_model.rollout()
            reward = runner(env, policy)

            x_next = x_next.reshape(nr_weights * nr_dims, )
            bo.add_observation(reward, x_next)

            rewards[run, n] = reward
            y_max, _, _ = bo.get_best_result()
            best_rewards[run, n] = y_max

    reward_mean, reward_std = post_processing(best_rewards)
    plot_reward(reward_mean, reward_std)


if __name__ == '__main__':
    main()

# --- Scratch: random-action rollout with frame rendering (kept for reference;
# `frames`, `observations`, and `ticks` are not defined above) ---
# for i in range(nr_steps):
#     action = random_state.uniform(spec.minimum, spec.maximum, spec.shape)
#     time_step = env.step(action)
#
#     camera0 = env.physics.render(camera_id=0, height=400, width=600)
#     frames.append(camera0)  # Directly append the frame without any modification
#     rewards.append(time_step.reward)
#     observations.append(copy.deepcopy(time_step.observation))
#     ticks.append(env.physics.data.time)
#
#
# # Show video and plot reward and observations
# for i in range(len(frames)):
#     if i % 20 == 0:  # Display every 20th frame for example purposes
#         print(frames[i].shape)
#         fig, ax = plt.subplots(1, 1)
#         ax.imshow(frames[i])
#         ax.axis('off')  # Turn off the axis
#
#         # Remove any whitespace from the edges
#         ax.set_xticks([])
#         ax.set_yticks([])
#         plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)
#         plt.margins(0, 0)
#         plt.gca().xaxis.set_major_locator(plt.NullLocator())
#         plt.gca().yaxis.set_major_locator(plt.NullLocator())
#
#         plt.show()
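
# The helper below is a minimal sketch, not part of the original experiment: it shows how a
# policy found by the optimization could be replayed and rendered, reusing only calls that
# already appear in this file (`env.step`, `env.physics.render`, matplotlib). The function
# name `replay_policy` and the camera/figure settings are assumptions.
def replay_policy(env_, policy_, frame_every=20):
    """Roll out an open-loop policy and display every `frame_every`-th rendered frame."""
    env_.reset()
    for step in range(min(nr_steps, len(policy_))):
        ts = env_.step(policy_[step])
        if step % frame_every == 0:
            frame = env_.physics.render(camera_id=0, height=400, width=600)
            plt.imshow(frame)
            plt.axis('off')
            plt.title(f'step {step}, reward {ts.reward}')
            plt.show()
        if ts.last():
            break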