From df79df98083d670f2d89bb99919cbda50137a914 Mon Sep 17 00:00:00 2001
From: "nikolaus.feith"
Date: Thu, 2 Feb 2023 18:54:09 +0100
Subject: [PATCH] initial commit

---
 .idea/.gitignore                               |   8 ++
 .idea/.name                                    |   1 +
 .idea/ActiveBOToytask.iml                      |  10 ++
 .idea/inspectionProfiles/Project_Default.xml   |  53 +++++++
 .../inspectionProfiles/profiles_settings.xml   |   6 +
 .idea/misc.xml                                 |   4 +
 .idea/modules.xml                              |   8 ++
 AcquistionFunctions/ExpectedImprovement.py     |  15 ++
 BayesianOptimization/BOwithGym.py              | 135 ++++++++++++++++++
 BayesianOptimization/BayesianOptimization.py   | 126 ++++++++++++++++
 PolicyModel/GaussianModel.py                   |  63 ++++++++
 ToyTask/MountainCar.py                         |  67 +++++++++
 12 files changed, 496 insertions(+)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/.name
 create mode 100644 .idea/ActiveBOToytask.iml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 AcquistionFunctions/ExpectedImprovement.py
 create mode 100644 BayesianOptimization/BOwithGym.py
 create mode 100644 BayesianOptimization/BayesianOptimization.py
 create mode 100644 PolicyModel/GaussianModel.py
 create mode 100644 ToyTask/MountainCar.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/.name b/.idea/.name
new file mode 100644
index 0000000..4b77b18
--- /dev/null
+++ b/.idea/.name
@@ -0,0 +1 @@
+ActiveBOToytask
\ No newline at end of file

[The remaining .idea files listed above (ActiveBOToytask.iml, inspectionProfiles/Project_Default.xml, inspectionProfiles/profiles_settings.xml, misc.xml, modules.xml) are PyCharm project configuration; their XML content did not survive in this excerpt, so those hunks are omitted.]

diff --git a/AcquistionFunctions/ExpectedImprovement.py b/AcquistionFunctions/ExpectedImprovement.py
new file mode 100644
index 0000000..6b17a06
--- /dev/null
+++ b/AcquistionFunctions/ExpectedImprovement.py
@@ -0,0 +1,15 @@
+import numpy as np
+from scipy.stats import norm
+
+def ExpectedImprovement(gp, X, nr_test, nr_weights, kappa=2.576, seed=None, lower=-1.0, upper=1.0):
+    y_hat = gp.predict(X)
+    best_y = max(y_hat)
+    rng = np.random.default_rng(seed=seed)
+    X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
+    mu, sigma = gp.predict(X_test, return_std=True)
+    z = (mu - best_y - kappa) / sigma
+    ei = (mu - best_y - kappa) * norm.cdf(z) + sigma * norm.pdf(z)
+
+    idx = np.argmax(ei)
+    X_next = X_test[idx, :]
+    return X_next
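Note on the acquisition above: it subtracts a fixed offset kappa from the GP mean before applying the EI closed form, and it divides by sigma without guarding against zero predictive variance. For comparison, a minimal standalone sketch of textbook Expected Improvement for maximization is shown below; the function name expected_improvement and the margin parameter xi are illustrative and are not part of this patch.

import numpy as np
from scipy.stats import norm

def expected_improvement(mu, sigma, best_y, xi=0.01):
    # Textbook EI for maximization: E[max(f - best_y - xi, 0)] under the GP posterior.
    improvement = mu - best_y - xi
    # Avoid dividing by zero where the GP is certain.
    z = np.divide(improvement, sigma, out=np.zeros_like(sigma), where=sigma > 0)
    ei = improvement * norm.cdf(z) + sigma * norm.pdf(z)
    return np.where(sigma > 0, ei, 0.0)  # zero variance -> no expected improvement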
diff --git a/BayesianOptimization/BOwithGym.py b/BayesianOptimization/BOwithGym.py
new file mode 100644
index 0000000..ffe3097
--- /dev/null
+++ b/BayesianOptimization/BOwithGym.py
@@ -0,0 +1,135 @@
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import Matern
+
+from PolicyModel.GaussianModel import GaussianPolicy
+from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
+
+import gym
+import time
+
+import matplotlib.pyplot as plt
+
+from warnings import catch_warnings, simplefilter
+
+class BayesianOptimization:
+    def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=8, policy_seed=None):
+        self.env = env
+        self.nr_init = nr_init
+        self.acq = acq
+        self.X = None
+        self.Y = None
+        self.gp = None
+
+        self.episode = 0
+        self.best_reward = np.empty((1, 1))
+        self.distance_penalty = 10
+
+        self.nr_policy_weights = nr_weights
+        self.nr_steps = nr_step
+        self.policy_seed = policy_seed
+        self.lowerb = -1.0
+        self.upperb = 1.0
+        self.policy_model = GaussianPolicy(self.nr_policy_weights,
+                                           self.nr_steps,
+                                           self.policy_seed,
+                                           self.lowerb,
+                                           self.upperb)
+
+        self.nr_test = 100
+
+    def initialize(self):
+        self.env.reset()
+        self.env.render()
+
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights))
+        self.Y = np.zeros((self.nr_init, 1))
+
+        for i in range(self.nr_init):
+            self.policy_model.random_policy()
+            self.X[i, :] = self.policy_model.weights.T
+            policy = self.policy_model.policy_rollout()
+
+            reward = self.runner(policy)
+
+            self.Y[i] = reward
+
+        self.gp = GaussianProcessRegressor(Matern(nu=1.5))
+        self.gp.fit(self.X, self.Y)
+
+    def runner(self, policy):
+        done = False
+        step_count = 0
+        env_reward = 0.0
+        while not done:
+            action = policy[step_count]
+            output = self.env.step(action)
+            env_reward += output[1]
+            done = output[2]
+            time.sleep(0.0001)
+            step_count += 1
+            if step_count >= self.nr_steps:
+                done = True
+                distance = -(self.env.goal_position - output[0][0])
+                env_reward += distance * self.distance_penalty
+
+        time.sleep(0.25)
+        self.env.reset()
+        return env_reward
+
+    def next_observation(self):
+        if self.acq == 'ei':
+            x_next = ExpectedImprovement(self.gp,
+                                         self.X,
+                                         self.nr_test,
+                                         self.nr_policy_weights,
+                                         seed=self.policy_seed,
+                                         lower=self.lowerb,
+                                         upper=self.upperb)
+
+            return x_next
+
+        else:
+            raise NotImplementedError
+
+    def eval_new_observation(self, x_next):
+        self.policy_model.weights = x_next
+        policy = self.policy_model.policy_rollout()
+        reward = self.runner(policy)
+
+        self.X = np.vstack((self.X, x_next))
+        self.Y = np.vstack((self.Y, reward))
+
+        self.gp.fit(self.X, self.Y)
+
+        if self.episode == 0:
+            self.best_reward[0] = max(self.Y)
+        else:
+            self.best_reward = np.vstack((self.best_reward, max(self.Y)))
+
+        self.episode += 1
+        self.policy_model.plot_policy()
+
+    def plot_reward(self):
+        episodes = np.linspace(0, self.episode, self.episode)
+        plt.plot(episodes, self.best_reward)
+        plt.show()
+
+
+def main():
+    nr_steps = 80
+    env = gym.envs.make('MountainCarContinuous-v0', render_mode="human")
+    bo = BayesianOptimization(env, nr_steps)
+    bo.initialize()
+    iteration_steps = 100
+    for i in range(iteration_steps):
+        x_next = bo.next_observation()
+        bo.eval_new_observation(x_next)
+
+        print(bo.episode, bo.best_reward[-1][0])
+
+    bo.plot_reward()
+    bo.policy_model.plot_policy()
+
+if __name__ == "__main__":
+    main()
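A note on the gym API used by runner() above: it unpacks env.step() positionally and treats output[2] as the episode flag. gym releases that accept render_mode="human" (0.26 and later) return a five-tuple (obs, reward, terminated, truncated, info), so both termination flags need to be considered. The helper below is only a sketch of a version-tolerant step under that assumption; the name rollout_step is illustrative and not part of the patch.

import numpy as np

def rollout_step(env, action):
    # Handles both the legacy 4-tuple and the gym>=0.26 5-tuple step API.
    result = env.step(np.asarray([action], dtype=np.float32))
    if len(result) == 5:
        obs, reward, terminated, truncated, info = result
        done = terminated or truncated
    else:
        obs, reward, done, info = result
    return obs, reward, done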
diff --git a/BayesianOptimization/BayesianOptimization.py b/BayesianOptimization/BayesianOptimization.py
new file mode 100644
index 0000000..1c52de6
--- /dev/null
+++ b/BayesianOptimization/BayesianOptimization.py
@@ -0,0 +1,126 @@
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import Matern
+
+from PolicyModel.GaussianModel import GaussianPolicy
+from ToyTask.MountainCar import MountainCarEnv
+from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
+
+import matplotlib.pyplot as plt
+
+from warnings import catch_warnings, simplefilter
+
+class BayesianOptimization:
+    def __init__(self, env, nr_init=3, acq='ei', nr_weights=8, policy_seed=None):
+        self.env = env
+        self.nr_init = nr_init
+        self.acq = acq
+        self.X = None
+        self.Y = None
+        self.gp = None
+
+        self.episode = 0
+        self.best_reward = np.empty((1, 1))
+
+        self.nr_policy_weights = nr_weights
+        self.nr_steps = env.max_step
+        self.policy_seed = policy_seed
+        self.lowerb = -1.0
+        self.upperb = 1.0
+        self.policy_model = GaussianPolicy(self.nr_policy_weights,
+                                           self.nr_steps,
+                                           self.policy_seed,
+                                           self.lowerb,
+                                           self.upperb)
+
+        self.nr_test = 100
+
+    def initialize(self):
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights))
+        self.Y = np.zeros((self.nr_init, 1))
+
+        for i in range(self.nr_init):
+            self.policy_model.random_policy()
+            self.X[i, :] = self.policy_model.weights.T
+            policy = self.policy_model.policy_rollout()
+
+            reward, step_count = self.env.runner(policy)
+
+            self.Y[i] = reward
+
+        self.gp = GaussianProcessRegressor(Matern(nu=1.5))
+        self.gp.fit(self.X, self.Y)
+
+    def next_observation(self):
+        if self.acq == 'ei':
+            x_next = ExpectedImprovement(self.gp,
+                                         self.X,
+                                         self.nr_test,
+                                         self.nr_policy_weights,
+                                         seed=self.policy_seed,
+                                         lower=self.lowerb,
+                                         upper=self.upperb)
+
+            return x_next
+
+        else:
+            raise NotImplementedError
+
+    def eval_new_observation(self, x_next):
+        self.policy_model.weights = x_next
+        policy = self.policy_model.policy_rollout()
+        reward, _ = self.env.runner(policy)
+
+        self.X = np.vstack((self.X, x_next))
+        self.Y = np.vstack((self.Y, reward))
+
+        self.gp.fit(self.X, self.Y)
+
+        if self.episode == 0:
+            self.best_reward[0] = max(self.Y)
+        else:
+            self.best_reward = np.vstack((self.best_reward, max(self.Y)))
+
+        self.episode += 1
+
+    def plot(self):
+        if self.nr_policy_weights == 1:
+            plt.scatter(self.X, self.Y)
+
+            x_plot = np.linspace(self.lowerb, self.upperb, 100).reshape(-1, 1)
+            with catch_warnings():
+                simplefilter('ignore')
+                y_plot, y_sigma = self.gp.predict(x_plot, return_std=True)
+
+            plt.plot(x_plot, y_plot)
+            plt.fill_between(
+                x_plot.ravel(),
+                y_plot - 1.96 * y_sigma,
+                y_plot + 1.96 * y_sigma,
+                alpha=0.5
+            )
+
+            plt.show()
+
+    def plot_reward(self):
+        episodes = np.linspace(0, self.episode, self.episode)
+        plt.plot(episodes, self.best_reward)
+        plt.show()
+
+
+
+def main():
+    max_step = 50
+    car = MountainCarEnv(max_step)
+    bo = BayesianOptimization(car)
+    bo.initialize()
+    iteration_steps = 100
+    for i in range(iteration_steps):
+        x_next = bo.next_observation()
+        bo.eval_new_observation(x_next)
+
+    bo.plot_reward()
+    bo.policy_model.plot_policy()
+
+if __name__ == "__main__":
+    main()
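PolicyModel/GaussianModel.py below builds each trajectory with a nested loop over time steps and basis functions. The same radial-basis-function superposition can be written as a single matrix product, which may help when reading policy_rollout(); the sketch below is illustrative only and mirrors the conventions used there (basis means spread over the steps, one shared bandwidth std).

import numpy as np

def rbf_rollout(weights, nr_steps, means, std):
    # features[i, j] = exp(-0.5 * (i - means[j])**2 / std**2); trajectory = features @ weights
    steps = np.arange(nr_steps)[:, None]                      # shape (nr_steps, 1)
    features = np.exp(-0.5 * (steps - means[None, :])**2 / std**2)
    return features @ weights.reshape(-1, 1)                  # shape (nr_steps, 1)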
diff --git a/PolicyModel/GaussianModel.py b/PolicyModel/GaussianModel.py
new file mode 100644
index 0000000..130e1d1
--- /dev/null
+++ b/PolicyModel/GaussianModel.py
@@ -0,0 +1,63 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+class GaussianPolicy:
+    def __init__(self, nr_weights, nr_steps, seed=None, lowerb=-1.0, upperb=1.0):
+        self.nr_weights = nr_weights
+        self.nr_steps = nr_steps
+        self.weights = None
+        self.trajectory = None
+        self.mean = np.linspace(0, self.nr_steps, self.nr_weights)
+        if nr_weights > 1:
+            self.std = self.mean[1] / (2 * np.sqrt(2 * np.log(2)))  # Full width at half maximum
+        else:
+            self.std = self.nr_steps / 2
+
+        self.rng = np.random.default_rng(seed=seed)
+        self.low = lowerb
+        self.upper = upperb
+
+        self.reset()
+
+    def reset(self):
+        self.weights = np.zeros((self.nr_weights, 1))
+        self.trajectory = np.zeros((self.nr_steps, 1))
+
+    def random_policy(self):
+        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)
+
+    def policy_rollout(self):
+        self.trajectory = np.zeros((self.nr_steps, 1))
+        for i in range(self.nr_steps):
+            for j in range(self.nr_weights):
+                base_fun = np.exp(-0.5*(i - self.mean[j])**2 / self.std**2)
+                self.trajectory[i] += base_fun * self.weights[j]
+
+        return self.trajectory
+
+    def plot_policy(self):
+        x = np.linspace(0, self.nr_steps, self.nr_steps)
+        plt.plot(x, self.trajectory)
+        for i in self.mean:
+            gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
+            plt.plot(x, gaussian)
+
+        plt.show()
+
+
+def main():
+    policy = GaussianPolicy(1, 50)
+    policy.random_policy()
+    policy.policy_rollout()
+    print(policy.weights)
+
+    fig, (ax1, ax2) = plt.subplots(2, 1)
+
+    x = np.linspace(0, policy.nr_steps, policy.nr_steps)
+    ax1.plot(x, policy.trajectory)
+    ax2.bar(policy.mean, policy.weights)
+
+    plt.show()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/ToyTask/MountainCar.py b/ToyTask/MountainCar.py
new file mode 100644
index 0000000..195b753
--- /dev/null
+++ b/ToyTask/MountainCar.py
@@ -0,0 +1,67 @@
+import numpy as np
+from math import cos
+
+class MountainCarEnv:
+    def __init__(self, max_step):
+        self.min_position = -1.2
+        self.max_position = 0.6
+        self.max_speed = 0.07
+        self.goal_position = 0.5
+        self.position = 0.0
+        self.speed = 0.0
+        self.reward = 0.0
+        self.step_count = 0
+        self.max_step = max_step
+        self.distance_penalty = 100
+
+        self.reset()
+
+    def reset(self):
+        self.position = np.random.uniform(low=-0.6, high=-0.4)
+        self.speed = 0.0
+        self.reward = 0.0
+        self.step_count = 0
+
+    def state(self):
+        return self.position, self.speed
+
+    @staticmethod
+    def minmax(value, lower, upper):
+        return lower if value < lower else upper if value > upper else value
+
+    def termination(self):
+        if self.step_count < self.max_step:
+            if self.position >= self.goal_position:
+                return True
+            else:
+                self.reward += -1.0
+                return False
+        else:
+            self.reward += -(self.goal_position - self.position) * self.distance_penalty
+            return True
+
+    def step(self, action):
+        position, speed = self.state()
+        speed += (action-1)*0.001 + cos(3*position)*(-0.0025)
+        speed = self.minmax(speed, -self.max_speed, self.max_speed)
+        position += speed
+        position = self.minmax(position, self.min_position, self.max_position)
+        if (position == self.min_position) and (speed < 0.0):
+            speed = 0.0
+
+        self.speed = speed
+        self.position = position
+
+        self.step_count += 1
+        done = self.termination()
+
+        return done, self.reward, self.step_count
+
+    def runner(self, policy):
+        done = False
+        self.reset()
+        while not done:
+            action = policy[self.step_count]
+            done, _, _ = self.step(float(action))
+
+        return self.reward, self.step_count
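A quick way to exercise the two new toy components together, independent of the BO loop: the snippet below is only a sketch and assumes the repository root is on the Python path so that PolicyModel and ToyTask import as plain packages.

# Smoke test: roll a random RBF policy through the toy MountainCarEnv.
from PolicyModel.GaussianModel import GaussianPolicy
from ToyTask.MountainCar import MountainCarEnv

max_step = 50
env = MountainCarEnv(max_step)
policy_model = GaussianPolicy(nr_weights=8, nr_steps=max_step, seed=0)
policy_model.random_policy()
trajectory = policy_model.policy_rollout()   # shape (max_step, 1)

reward, steps = env.runner(trajectory)
print(f"episode reward: {reward:.2f} after {steps} steps")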