diff --git a/BayesianOptimization/BOwithGym.py b/BayesianOptimization/BOwithGym.py
index 043ea43..08e3a98 100644
--- a/BayesianOptimization/BOwithGym.py
+++ b/BayesianOptimization/BOwithGym.py
@@ -41,8 +41,15 @@ class BayesianOptimization:
 
         self.nr_test = 100
 
+    def reset_bo(self):
+        self.counter_array = np.zeros((1, 1))
+        self.gp = None
+        self.episode = 0
+        self.best_reward = np.empty((1, 1))
+
     def initialize(self):
         self.env.reset()
+        self.reset_bo()
         if self.env.render_mode == 'human':
             self.env.render()
 
@@ -177,14 +184,17 @@ class BayesianOptimization:
             )
             plt.show()
 
-    def get_best_result(self):
+    def get_best_result(self, plotter=True):
         y_hat = self.gp.predict(self.X)
         idx = np.argmax(y_hat)
         x_max = self.X[idx, :]
         self.policy_model.weights = x_max
         self.policy_model.policy_rollout()
-        print(self.counter_array[idx], idx)
-        self.policy_model.plot_policy(finished=self.counter_array[idx])
+        if plotter:
+            print(self.counter_array[idx], idx)
+            self.policy_model.plot_policy(finished=self.counter_array[idx])
+        else:
+            return self.counter_array[idx]
 
 def main():
     nr_steps = 100
diff --git a/runner/BOGymRunner.py b/runner/BOGymRunner.py
index e69de29..25a43e1 100644
--- a/runner/BOGymRunner.py
+++ b/runner/BOGymRunner.py
@@ -0,0 +1,92 @@
+from BayesianOptimization.BOwithGym import BayesianOptimization
+from ToyTask.MountainCarGym import Continuous_MountainCarEnv
+import numpy as np
+
+import matplotlib.pyplot as plt
+
+# BO parameters
+env = Continuous_MountainCarEnv()
+nr_steps = 100
+acquisition_fun = 'ei'
+iteration_steps = 500
+
+nr_runs = 20
+
+# storage arrays
+finished_store = np.zeros((1, nr_runs))
+best_policy = np.zeros((nr_steps, nr_runs))
+reward_store = np.zeros((iteration_steps, nr_runs))
+
+# post-processing
+def post_processing(finished, policy, reward):
+
+    finish_mean = np.nanmean(finished)
+    finish_std = np.nanstd(finished)
+
+    policy_mean = np.mean(policy, axis=1)
+    policy_std = np.std(policy, axis=1)
+
+    reward_mean = np.mean(reward, axis=1)
+    reward_std = np.std(reward, axis=1)
+
+    return finish_mean, finish_std, policy_mean, policy_std, reward_mean, reward_std
+
+# plot functions
+def plot_policy(mean, std, fin_mean, fin_std):
+    x = np.linspace(0, mean.shape[0], mean.shape[0])
+    plt.plot(x, mean)
+    plt.fill_between(
+        x,
+        mean - 1.96 * std,
+        mean + 1.96 * std,
+        alpha=0.5
+    )
+
+    y = np.linspace(-2, 2, 50)
+    plt.vlines(fin_mean, -2, 2, colors='red')
+    plt.fill_betweenx(
+        y,
+        fin_mean - 1.96 * fin_std,
+        fin_mean + 1.96 * fin_std,
+        alpha=0.5,
+    )
+
+    plt.show()
+
+def plot_reward(mean, std):
+    eps = np.linspace(0, mean.shape[0], mean.shape[0])
+    plt.plot(eps, mean)
+
+    plt.fill_between(
+        eps,
+        mean - 1.96 * std,
+        mean + 1.96 * std,
+        alpha=0.5
+    )
+    plt.show()
+
+# main
+def main():
+    global finished_store, best_policy, reward_store
+    bo = BayesianOptimization(env, nr_steps, acq=acquisition_fun)
+    for i in range(nr_runs):
+        print('Iteration:', str(i))
+        bo.initialize()
+        for j in range(iteration_steps):
+            x_next = bo.next_observation()
+            bo.eval_new_observation(x_next)
+
+        finished = bo.get_best_result(plotter=False)
+
+        finished_store[:, i] = finished
+        best_policy[:, i] = bo.policy_model.trajectory.T
+        reward_store[:, i] = bo.best_reward.T
+
+    finish_mean, finish_std, policy_mean, policy_std, reward_mean, reward_std = post_processing(finished_store,
+                                                                                                best_policy,
+                                                                                                reward_store)
+    plot_policy(policy_mean, policy_std, finish_mean, finish_std)
+    plot_reward(reward_mean, reward_std)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file