Added BOGymRunner.py
This commit is contained in:
parent
caf76c6f9b
commit
d568523c1a
@ -41,8 +41,15 @@ class BayesianOptimization:
|
|||||||
|
|
||||||
self.nr_test = 100
|
self.nr_test = 100
|
||||||
|
|
||||||
|
def reset_bo(self):
    """Reset the per-run Bayesian-optimisation state before a fresh run.

    Drops the GP surrogate and the episode counter, and re-allocates the
    bookkeeping arrays (finish counters, best rewards) to their initial
    1x1 shape. Called from initialize() at the start of every run.
    """
    # Surrogate model and progress counter go back to their pristine state.
    self.gp = None
    self.episode = 0
    # Bookkeeping arrays are re-created rather than cleared, so any old
    # references held elsewhere are not silently overwritten.
    self.counter_array = np.zeros((1, 1))
    self.best_reward = np.empty((1, 1))
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.env.reset()
|
self.env.reset()
|
||||||
|
self.reset_bo()
|
||||||
if self.env.render_mode == 'human':
|
if self.env.render_mode == 'human':
|
||||||
self.env.render()
|
self.env.render()
|
||||||
|
|
||||||
@ -177,14 +184,17 @@ class BayesianOptimization:
|
|||||||
)
|
)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def get_best_result(self, plotter=True):
    """Roll out the best policy found so far, judged by the GP posterior mean.

    Predicts the reward for every evaluated weight vector in self.X, loads
    the argmax weights into the policy model and performs a rollout.

    :param plotter: when True, print and plot the winning policy (returns
        None); when False, return the finish counter of the winning index.
    """
    posterior_mean = self.gp.predict(self.X)
    best_idx = np.argmax(posterior_mean)
    self.policy_model.weights = self.X[best_idx, :]
    self.policy_model.policy_rollout()

    # Guard clause: non-interactive callers just want the finish counter.
    if not plotter:
        return self.counter_array[best_idx]

    print(self.counter_array[best_idx], best_idx)
    self.policy_model.plot_policy(finished=self.counter_array[best_idx])
|
||||||
def main():
|
def main():
|
||||||
nr_steps = 100
|
nr_steps = 100
|
||||||
|
@ -0,0 +1,92 @@
|
|||||||
|
from BayesianOptimization.BOwithGym import BayesianOptimization
|
||||||
|
from ToyTask.MountainCarGym import Continuous_MountainCarEnv
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# BO parameters for the mountain-car experiment.
env = Continuous_MountainCarEnv()  # toy task under optimisation
nr_steps = 100         # trajectory length per rollout (rows of best_policy)
acquisition_fun = 'ei'  # acquisition key passed to BayesianOptimization — presumably expected improvement; confirm in BOwithGym
iteration_steps = 500  # BO observations per run (rows of reward_store)

nr_runs = 20           # independent repetitions of the whole experiment

# storage arrays — one column per run, filled in place by main()
finished_store = np.zeros((1, nr_runs))
best_policy = np.zeros((nr_steps, nr_runs))
reward_store = np.zeros((iteration_steps, nr_runs))
|
||||||
|
|
||||||
|
# post-processing
|
||||||
|
def post_processing(finished, policy, reward):
    """Aggregate per-run results into mean/std statistics.

    :param finished: (1, nr_runs) finish counters; NaN entries are ignored.
    :param policy: (nr_steps, nr_runs) best trajectories, one column per run.
    :param reward: (iteration_steps, nr_runs) reward curves, one per run.
    :return: (finish_mean, finish_std, policy_mean, policy_std,
              reward_mean, reward_std) — scalars for the finish stats,
              per-step/per-iteration vectors for the rest.
    """
    # Finish counters may contain NaN for runs that never finished,
    # hence the nan-aware reductions.
    finish_stats = (np.nanmean(finished), np.nanstd(finished))
    # Reduce across runs (axis=1), keeping one value per step/iteration.
    policy_stats = (np.mean(policy, axis=1), np.std(policy, axis=1))
    reward_stats = (np.mean(reward, axis=1), np.std(reward, axis=1))
    return (*finish_stats, *policy_stats, *reward_stats)
|
||||||
|
|
||||||
|
# plot functions
|
||||||
|
def plot_policy(mean, std, fin_mean, fin_std):
    """Plot the mean best policy with a 95% band, plus the finish marker.

    Draws the per-step policy mean with a 1.96-sigma fill, then a red
    vertical line at the mean finish step with its own horizontal
    1.96-sigma band. Blocks on plt.show().
    """
    n_steps = mean.shape[0]
    x = np.linspace(0, n_steps, n_steps)
    lower = mean - 1.96 * std
    upper = mean + 1.96 * std
    plt.plot(x, mean)
    plt.fill_between(x, lower, upper, alpha=0.5)

    # Finish-step marker: vertical line spanning the visible action range,
    # with a shaded confidence band around the mean finish step.
    y = np.linspace(-2, 2, 50)
    plt.vlines(fin_mean, -2, 2, colors='red')
    plt.fill_betweenx(
        y,
        fin_mean - 1.96 * fin_std,
        fin_mean + 1.96 * fin_std,
        alpha=0.5,
    )

    plt.show()
|
||||||
|
|
||||||
|
def plot_reward(mean, std):
    """Plot the mean best-reward curve over BO iterations with a 95% band.

    Blocks on plt.show().
    """
    n_eps = mean.shape[0]
    eps = np.linspace(0, n_eps, n_eps)
    lower = mean - 1.96 * std
    upper = mean + 1.96 * std

    plt.plot(eps, mean)
    plt.fill_between(eps, lower, upper, alpha=0.5)
    plt.show()
|
||||||
|
|
||||||
|
# main
|
||||||
|
def main():
    """Run nr_runs independent BO experiments and plot aggregate statistics.

    For each run: initialize the optimizer, take iteration_steps BO
    observations, then record the best result into the module-level
    storage arrays. Afterwards, post-process and plot policy/reward stats.

    Note: the storage arrays are only mutated item-wise (never rebound),
    so no `global` declaration is required.
    """
    bo = BayesianOptimization(env, nr_steps, acq=acquisition_fun)
    for i in range(nr_runs):
        print('Iteration:', i)  # str() was redundant — print formats ints
        bo.initialize()
        # Inner BO loop: propose and evaluate one observation per step.
        for _ in range(iteration_steps):
            x_next = bo.next_observation()
            bo.eval_new_observation(x_next)

        # plotter=False makes get_best_result return the finish counter
        # instead of plotting.
        finished_store[:, i] = bo.get_best_result(plotter=False)
        best_policy[:, i] = bo.policy_model.trajectory.T
        reward_store[:, i] = bo.best_reward.T

    (finish_mean, finish_std,
     policy_mean, policy_std,
     reward_mean, reward_std) = post_processing(finished_store,
                                                best_policy,
                                                reward_store)
    plot_policy(policy_mean, policy_std, finish_mean, finish_std)
    plot_reward(reward_mean, reward_std)
|
||||||
|
|
||||||
|
# Script entry point: run the full experiment suite when executed directly.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user