import numpy as np from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import Matern from PolicyModel.GaussianModel import GaussianPolicy from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement from AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement from AcquistionFunctions.ConfidenceBound import ConfidenceBound from ToyTask.MountainCarGym import Continuous_MountainCarEnv from ToyTask.Pendulum import PendulumEnv import time import matplotlib.pyplot as plt class BayesianOptimization: def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None, env_seed=None): self.env = env self.env_seed = env_seed self.nr_init = nr_init self.acq = acq self.X = None self.Y = None self.counter_array = np.zeros((1, 1)) self.gp = None self.episode = 0 self.best_reward = np.empty((1, 1)) self.distance_penalty = 0 self.nr_policy_weights = nr_weights self.nr_steps = nr_step self.policy_seed = policy_seed self.lowerb = -1.0 self.upperb = 1.0 self.policy_model = GaussianPolicy(self.nr_policy_weights, self.nr_steps, self.policy_seed, self.lowerb, self.upperb) self.nr_test = 100 def reset_bo(self): self.counter_array = np.zeros((1, 1)) self.gp = None self.episode = 0 self.best_reward = np.empty((1, 1)) def initialize(self): self.env.reset(seed=self.env_seed) self.reset_bo() if self.env.render_mode == 'human': self.env.render() self.X = np.zeros((self.nr_init, self.nr_policy_weights)) self.Y = np.zeros((self.nr_init, 1)) for i in range(self.nr_init): self.policy_model.random_policy() self.X[i, :] = self.policy_model.weights.T policy = self.policy_model.policy_rollout() reward, step_count = self.runner(policy) self.Y[i] = reward self.gp = GaussianProcessRegressor(Matern(nu=1.5)) self.gp.fit(self.X, self.Y) def runner(self, policy): self.env.reset(seed=self.env_seed) done = False step_count = 0 env_reward = 0.0 while not done: action = policy[step_count] action_clipped = action.clip(min=-1.0, max=1.0) output = self.env.step(action_clipped.astype(np.float32)) env_reward += output[1] done = output[2] if self.env.render_mode == 'human': time.sleep(0.0001) step_count += 1 if step_count >= self.nr_steps: done = True if self.counter_array[0] == 0: if step_count == 100: step_count = np.NAN self.counter_array[0] = step_count else: if step_count == 100: step_count = np.NAN self.counter_array = np.vstack((self.counter_array, step_count)) if self.env.render_mode == 'human': time.sleep(0.25) return env_reward, step_count def next_observation(self): if self.acq == 'ei': x_next = ExpectedImprovement(self.gp, self.X, self.nr_test, self.nr_policy_weights, kappa=0, seed=self.policy_seed, lower=self.lowerb, upper=self.upperb) return x_next elif self.acq == 'pi': x_next = ProbabilityOfImprovement(self.gp, self.X, self.nr_test, self.nr_policy_weights, kappa=0, seed=self.policy_seed, lower=self.lowerb, upper=self.upperb) return x_next elif self.acq == 'cb': x_next = ConfidenceBound(self.gp, self.X, self.nr_test, self.nr_policy_weights, lam=2.576, seed=self.policy_seed, lower=self.lowerb, upper=self.upperb) return x_next else: raise NotImplementedError def eval_new_observation(self, x_next): self.policy_model.weights = x_next policy = self.policy_model.policy_rollout() reward, step_count = self.runner(policy) self.X = np.vstack((self.X, x_next)) self.Y = np.vstack((self.Y, reward)) self.gp.fit(self.X, self.Y) if self.episode == 0: self.best_reward[0] = max(self.Y) else: self.best_reward = np.vstack((self.best_reward, max(self.Y))) self.episode += 1 return step_count def plot_reward(self): epsiodes = np.linspace(0, self.episode, self.episode) plt.plot(epsiodes, self.best_reward) plt.show() def plot_objective_function(self): plt.scatter(self.X[:, 0], self.Y) x_dom = np.linspace(self.lowerb * np.ones((1, 6)), self.upperb * np.ones((1, 6)), 100).reshape(-1, self.nr_policy_weights) y_plot, y_sigma = self.gp.predict(x_dom, return_std=True) y_plot = y_plot.reshape(-1) y_sigma = y_sigma.reshape(-1) print(y_plot.shape, x_dom.shape) plt.plot(x_dom[:, 0], y_plot) plt.fill_between( x_dom[:, 0], y_plot - 1.96 * y_sigma, y_plot + 1.96 * y_sigma, alpha=0.5 ) plt.show() def get_best_result(self, plotter=True): y_hat = self.gp.predict(self.X) idx = np.argmax(y_hat) x_max = self.X[idx, :] self.policy_model.weights = x_max self.policy_model.policy_rollout() if plotter: self.policy_model.plot_policy(finished=self.counter_array[idx]) else: return self.counter_array[idx] def main(): nr_steps = 100 env = PendulumEnv(render_mode='human') # render_mode='human' bo = BayesianOptimization(env, nr_steps, nr_weights=10, acq='ei', env_seed=1234) bo.initialize() iteration_steps = 100 for i in range(iteration_steps): x_next = bo.next_observation() step_count = bo.eval_new_observation(x_next) print(bo.episode, bo.best_reward[-1][0], step_count) bo.plot_reward() bo.get_best_result() plt.show() if __name__ == "__main__": main()