From 78eccba14bb39de832942d6ca2029bd81f2590eb Mon Sep 17 00:00:00 2001
From: "nikolaus.feith"
Date: Mon, 6 Feb 2023 15:43:30 +0100
Subject: [PATCH] added more Acquisition functions

---
 AcquistionFunctions/ConfidenceBound.py |  14 +++
 .../ProbabilityOfImprovement.py        |  15 +++
 BayesianOptimization/BOwithGym.py      | 100 +++++++++++++++---
 PolicyModel/GaussianModel.py           |   6 +-
 4 files changed, 119 insertions(+), 16 deletions(-)
 create mode 100644 AcquistionFunctions/ConfidenceBound.py
 create mode 100644 AcquistionFunctions/ProbabilityOfImprovement.py

diff --git a/AcquistionFunctions/ConfidenceBound.py b/AcquistionFunctions/ConfidenceBound.py
new file mode 100644
index 0000000..1a2cbdf
--- /dev/null
+++ b/AcquistionFunctions/ConfidenceBound.py
@@ -0,0 +1,14 @@
+import numpy as np
+from scipy.stats import norm
+
+def ConfidenceBound(gp, X, nr_test, nr_weights, lam=1.2, seed=None, lower=-1.0, upper=1.0):
+    y_hat = gp.predict(X)
+    best_y = max(y_hat)
+    rng = np.random.default_rng(seed=seed)
+    X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
+    mu, sigma = gp.predict(X_test, return_std=True)
+    cb = mu + lam * sigma
+
+    idx = np.argmax(cb)
+    X_next = X_test[idx, :]
+    return X_next
diff --git a/AcquistionFunctions/ProbabilityOfImprovement.py b/AcquistionFunctions/ProbabilityOfImprovement.py
new file mode 100644
index 0000000..3253438
--- /dev/null
+++ b/AcquistionFunctions/ProbabilityOfImprovement.py
@@ -0,0 +1,15 @@
+import numpy as np
+from scipy.stats import norm
+
+def ProbabilityOfImprovement(gp, X, nr_test, nr_weights, kappa=2.576, seed=None, lower=-1.0, upper=1.0):
+    y_hat = gp.predict(X)
+    best_y = max(y_hat)
+    rng = np.random.default_rng(seed=seed)
+    X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
+    mu, sigma = gp.predict(X_test, return_std=True)
+    z = (mu - best_y - kappa) / sigma
+    pi = norm.cdf(z)
+
+    idx = np.argmax(pi)
+    X_next = X_test[idx, :]
+    return X_next
diff --git a/BayesianOptimization/BOwithGym.py b/BayesianOptimization/BOwithGym.py
index e114b70..025f142 100644
--- a/BayesianOptimization/BOwithGym.py
+++ b/BayesianOptimization/BOwithGym.py
@@ -4,6 +4,8 @@ from sklearn.gaussian_process.kernels import Matern
 
 from PolicyModel.GaussianModel import GaussianPolicy
 from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
+from AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
+from AcquistionFunctions.ConfidenceBound import ConfidenceBound
 
 from ToyTask.MountainCarGym import Continuous_MountainCarEnv
 
@@ -12,6 +14,7 @@ import time
 
 import matplotlib.pyplot as plt
 
+
 class BayesianOptimization:
     def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
         self.env = env
@@ -19,11 +22,12 @@ class BayesianOptimization:
         self.acq = acq
         self.X = None
         self.Y = None
+        self.counter_array = np.zeros((1, 1))
         self.gp = None
         self.episode = 0
         self.best_reward = np.empty((1, 1))
-        self.distance_penalty = 100
+        self.distance_penalty = 0
 
         self.nr_policy_weights = nr_weights
         self.nr_steps = nr_step
@@ -40,7 +44,8 @@ class BayesianOptimization:
 
     def initialize(self):
         self.env.reset()
-        self.env.render()
+        if self.env.render_mode == 'human':
+            self.env.render()
 
         self.X = np.zeros((self.nr_init, self.nr_policy_weights))
         self.Y = np.zeros((self.nr_init, 1))
@@ -50,10 +55,11 @@ class BayesianOptimization:
             self.X[i, :] = self.policy_model.weights.T
             policy = self.policy_model.policy_rollout()
 
-            reward = self.runner(policy)
+            reward, step_count = self.runner(policy)
 
            self.Y[i] = reward
 
+        self.gp = GaussianProcessRegressor(Matern(nu=1.5))
         self.gp.fit(self.X, self.Y)
@@ -66,16 +72,29 @@ class BayesianOptimization:
             output = self.env.step(action)
             env_reward += output[1]
             done = output[2]
-            time.sleep(0.0001)
+            if self.env.render_mode == 'human':
+                time.sleep(0.0001)
             step_count += 1
             if step_count >= self.nr_steps:
                 done = True
                 distance = -(self.env.goal_position - output[0][0])
                 env_reward += distance * self.distance_penalty
 
-        time.sleep(0.25)
+        if self.counter_array[0] == 0:
+
+            if step_count == 100:
+                step_count = np.NAN
+            self.counter_array[0] = step_count
+
+        else:
+            if step_count == 100:
+                step_count = np.NAN
+            self.counter_array = np.vstack((self.counter_array, step_count))
+
+        if self.env.render_mode == 'human':
+            time.sleep(0.25)
         self.env.reset()
-        return env_reward
+        return env_reward, step_count
 
     def next_observation(self):
         if self.acq == 'ei':
@@ -83,19 +102,44 @@
                                          self.X,
                                          self.nr_test,
                                          self.nr_policy_weights,
+                                         kappa=0,
                                          seed=self.policy_seed,
                                          lower=self.lowerb,
                                          upper=self.upperb)
 
             return x_next
 
+        elif self.acq == 'pi':
+            x_next = ProbabilityOfImprovement(self.gp,
+                                              self.X,
+                                              self.nr_test,
+                                              self.nr_policy_weights,
+                                              kappa=2.576,
+                                              seed=self.policy_seed,
+                                              lower=self.lowerb,
+                                              upper=self.upperb)
+
+            return x_next
+
+        elif self.acq == 'cb':
+            x_next = ConfidenceBound(self.gp,
+                                     self.X,
+                                     self.nr_test,
+                                     self.nr_policy_weights,
+                                     lam=2.576,
+                                     seed=self.policy_seed,
+                                     lower=self.lowerb,
+                                     upper=self.upperb)
+
+            return x_next
+
         else:
             raise NotImplementedError
 
     def eval_new_observation(self, x_next):
         self.policy_model.weights = x_next
         policy = self.policy_model.policy_rollout()
-        reward = self.runner(policy)
+        reward, step_count = self.runner(policy)
 
         self.X = np.vstack((self.X, x_next))
         self.Y = np.vstack((self.Y, reward))
@@ -108,28 +152,58 @@
         self.best_reward = np.vstack((self.best_reward, max(self.Y)))
         self.episode += 1
-        self.policy_model.plot_policy()
+        return step_count
 
     def plot_reward(self):
         epsiodes = np.linspace(0, self.episode, self.episode)
         plt.plot(epsiodes, self.best_reward)
+        plt.show()
 
+    def plot_objective_function(self):
+        plt.scatter(self.X[:, 0], self.Y)
+
+        x_dom = np.linspace(self.lowerb * np.ones((1, 6)), self.upperb * np.ones((1, 6)), 100).reshape(-1, self.nr_policy_weights)
+        y_plot, y_sigma = self.gp.predict(x_dom, return_std=True)
+
+        y_plot = y_plot.reshape(-1)
+        y_sigma = y_sigma.reshape(-1)
+        print(y_plot.shape, x_dom.shape)
+
+        plt.plot(x_dom[:, 0], y_plot)
+        plt.fill_between(
+            x_dom[:, 0],
+            y_plot - 1.96 * y_sigma,
+            y_plot + 1.96 * y_sigma,
+            alpha=0.5
+        )
+        plt.show()
+
+    def get_best_result(self):
+        y_hat = self.gp.predict(self.X)
+        idx = np.argmax(y_hat)
+        x_max = self.X[idx, :]
+        self.policy_model.weights = x_max
+        self.policy_model.policy_rollout()
+        print(self.counter_array[idx], idx)
+        self.policy_model.plot_policy(finished=self.counter_array[idx])
 
 def main():
     nr_steps = 100
-    env = Continuous_MountainCarEnv(render_mode='human')
-    bo = BayesianOptimization(env, nr_steps)
+    env = Continuous_MountainCarEnv()  # render_mode='human'
+    bo = BayesianOptimization(env, nr_steps, acq='cb')
     bo.initialize()
     iteration_steps = 200
     for i in range(iteration_steps):
         x_next = bo.next_observation()
-        bo.eval_new_observation(x_next)
+        step_count = bo.eval_new_observation(x_next)
 
-        print(bo.episode, bo.best_reward[-1][0])
+        print(bo.episode, bo.best_reward[-1][0], step_count)
 
     bo.plot_reward()
-    bo.policy_model.plot_policy()
+    bo.get_best_result()
+    plt.show()
+
 
 if __name__ == "__main__":
     main()
diff --git a/PolicyModel/GaussianModel.py b/PolicyModel/GaussianModel.py
index 21cbf35..4574201 100644
--- a/PolicyModel/GaussianModel.py
+++ b/PolicyModel/GaussianModel.py
@@ -35,15 +35,15 @@ class GaussianPolicy:
 
         return self.trajectory
 
-    def plot_policy(self):
+    def plot_policy(self, finished=np.NAN):
         x = np.linspace(0, self.nr_steps, self.nr_steps)
         plt.plot(x, self.trajectory)
+        if finished != np.NAN:
+            plt.vlines(finished, -1, 1, colors='red')
         # for i in self.mean:
         #     gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
         #     plt.plot(x, gaussian)
-        plt.show()
-
 
 def main():
     policy = GaussianPolicy(1, 50)
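
Note (not part of the commit): as a quick reference for how the two new acquisition functions pick the next candidate, below is a minimal standalone sketch that re-states their selection rules, cb = mu + lam * sigma and pi = Phi((mu - best_y - kappa) / sigma), against a scikit-learn GP on a toy 1-D objective. The snake_case helper names, the toy objective, and the sigma floor are illustrative assumptions, not code from the repository; the patch itself imports ConfidenceBound and ProbabilityOfImprovement from AcquistionFunctions.

import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern


def confidence_bound(gp, nr_test, nr_weights, lam=1.2, seed=None, lower=-1.0, upper=1.0):
    # Upper confidence bound over random candidates: maximise mu + lam * sigma.
    rng = np.random.default_rng(seed=seed)
    X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
    mu, sigma = gp.predict(X_test, return_std=True)
    return X_test[np.argmax(mu + lam * sigma), :]


def probability_of_improvement(gp, X, nr_test, nr_weights, kappa=2.576, seed=None, lower=-1.0, upper=1.0):
    # Probability of improving on the best predicted training value by at least kappa.
    best_y = np.max(gp.predict(X))
    rng = np.random.default_rng(seed=seed)
    X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
    mu, sigma = gp.predict(X_test, return_std=True)
    sigma = np.maximum(sigma, 1e-12)  # illustrative guard against division by zero
    return X_test[np.argmax(norm.cdf((mu - best_y - kappa) / sigma)), :]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X = rng.uniform(-1.0, 1.0, (5, 1))      # five initial 1-D weight samples
    y = -(X[:, 0] - 0.3) ** 2               # toy objective with its maximum at 0.3
    gp = GaussianProcessRegressor(Matern(nu=1.5)).fit(X, y)
    print("next candidate (cb):", confidence_bound(gp, nr_test=100, nr_weights=1, lam=2.576, seed=1))
    print("next candidate (pi):", probability_of_improvement(gp, X, nr_test=100, nr_weights=1, kappa=0.1, seed=1))

With lam=2.576 the confidence-bound selection mirrors the value that BOwithGym.py now passes when main() constructs BayesianOptimization with acq='cb'.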