Manual application complete, BO started
This commit is contained in:
parent 5a8a353c0e
commit 916b11e933
@@ -0,0 +1,149 @@
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

from active_bo_ros.PolicyModel.GaussianRBFModel import GaussianRBF
from active_bo_ros.AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
from active_bo_ros.AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
from active_bo_ros.AcquistionFunctions.ConfidenceBound import ConfidenceBound

from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv


class BayesianOptimization:
    def __init__(self, env, nr_steps, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
        self.env = env
        self.nr_init = nr_init
        self.acq = acq

        self.X = None
        self.Y = None
        self.GP = None

        self.episode = 0
        self.counter_array = np.empty((1, 1))
        self.best_reward = np.empty((1, 1))
        self.distance_penalty = 0

        self.nr_policy_weights = nr_weights
        self.nr_steps = nr_steps
        self.policy_seed = policy_seed
        self.lower_bound = -1.0
        self.upper_bound = 1.0

        self.policy_model = GaussianRBF(self.nr_policy_weights,
                                        self.nr_steps,
                                        self.policy_seed,
                                        self.lower_bound,
                                        self.upper_bound)

        self.eval_X = 100

    def reset_bo(self):
        self.counter_array = np.empty((1, 1))
        self.GP = None
        self.episode = 0
        self.best_reward = np.empty((1, 1))

    def runner(self, policy):
        done = False
        env_reward = 0.0
        step_count = 0

        for i in range(len(policy)):
            action = policy[i]
            output = self.env.step(action)

            env_reward += output[1]
            done = output[2]
            step_count += 1

            if done:
                self.counter_array = np.vstack((self.counter_array, step_count))
                break

        if not done and i == len(policy) - 1:  # last loop index; episode ran out of steps
            # Penalize by the remaining distance to the goal.
            distance = -(self.env.goal_position - output[0][0])
            env_reward += distance * self.distance_penalty
            self.counter_array = np.vstack((self.counter_array, step_count))

        self.env.reset()
        return env_reward, step_count

    def initialize(self):
        self.env.reset()
        self.reset_bo()

        self.X = np.zeros((self.nr_init, self.nr_policy_weights))
        self.Y = np.zeros((self.nr_init, 1))

        for i in range(self.nr_init):
            self.policy_model.random_policy()
            self.X[i, :] = self.policy_model.weights.T
            policy = self.policy_model.rollout()

            reward, step_count = self.runner(policy)

            self.Y[i] = reward

        self.GP = GaussianProcessRegressor(Matern(nu=1.5))
        self.GP.fit(self.X, self.Y)

    def next_observation(self):
        if self.acq == 'ei':
            x_next = ExpectedImprovement(self.GP,
                                         self.X,
                                         self.eval_X,
                                         self.nr_policy_weights,
                                         kappa=0,
                                         seed=self.policy_seed,
                                         lower=self.lower_bound,
                                         upper=self.upper_bound)

            return x_next

        elif self.acq == 'pi':
            x_next = ProbabilityOfImprovement(self.GP,
                                              self.X,
                                              self.eval_X,
                                              self.nr_policy_weights,
                                              kappa=0,
                                              seed=self.policy_seed,
                                              lower=self.lower_bound,
                                              upper=self.upper_bound)

            return x_next

        elif self.acq == 'cb':
            x_next = ConfidenceBound(self.GP,
                                     self.X,
                                     self.eval_X,
                                     self.nr_policy_weights,
                                     lam=2.576,
                                     seed=self.policy_seed,
                                     lower=self.lower_bound,
                                     upper=self.upper_bound)

            return x_next

        else:
            raise NotImplementedError

    def eval_new_observation(self, x_next):
        self.policy_model.weights = x_next
        policy = self.policy_model.rollout()

        reward, step_count = self.runner(policy)

        self.X = np.vstack((self.X, x_next))
        self.Y = np.vstack((self.Y, reward))

        self.GP.fit(self.X, self.Y)

        self.best_reward = np.vstack((self.best_reward, max(self.Y)))

        self.episode += 1
        return step_count
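For orientation, the class above is meant to be driven as an initialize / suggest / evaluate loop. The following is a minimal driver sketch, not code from the commit: the environment choice, nr_steps=50 and the 20-episode budget are assumptions, and it reuses the imports at the top of the file.

    # Hypothetical driver loop for BayesianOptimization, for illustration only.
    env = Continuous_MountainCarEnv()
    bo = BayesianOptimization(env, nr_steps=50, nr_init=3, acq='ei', nr_weights=6)

    bo.initialize()                       # roll out nr_init random policies, fit the first GP
    for _ in range(20):                   # assumed optimization budget
        x_next = bo.next_observation()    # query the chosen acquisition function
        bo.eval_new_observation(x_next)   # roll out x_next, append (x, y), refit the GP

    best_weights = bo.X[np.argmax(bo.Y)]  # best policy weights observed so far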
@@ -24,7 +24,7 @@ class GaussianRBF:
    def random_policy(self):
        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)

-    def policy_rollout(self):
+    def rollout(self):
        self.policy = np.zeros((self.nr_steps, 1))
        for i in range(self.nr_steps):
            for j in range(self.nr_weights):
@@ -33,16 +33,6 @@ class GaussianRBF:

        return self.policy

    # def plot_policy(self, finished=np.NAN):
    #     x = np.linspace(0, self.nr_steps, self.nr_steps)
    #     plt.plot(x, self.trajectory)
    #     if finished != np.NAN:
    #         plt.vlines(finished, -1, 1, colors='red')
    #     # for i in self.mean:
    #     #     gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
    #     #     plt.plot(x, gaussian)


def main():
    policy = GaussianRBFModel(1, 50)
    policy.random_policy()
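The rollout above evaluates, for every time step, a weighted sum of Gaussian radial basis functions over the step index. As a self-contained illustration of that mapping (the evenly spaced centres and the width are assumptions; the repository class stores its own means and std internally):

    import numpy as np

    def rbf_policy(weights, nr_steps, std=None):
        """Map RBF weights to an open-loop action sequence of length nr_steps (sketch)."""
        nr_weights = len(weights)
        centres = np.linspace(0, nr_steps - 1, nr_weights)        # assumed: evenly spaced centres
        std = std if std is not None else nr_steps / nr_weights   # assumed basis width
        steps = np.arange(nr_steps)
        # policy[i] = sum_j w_j * exp(-0.5 * (i - c_j)^2 / std^2)
        features = np.exp(-0.5 * (steps[:, None] - centres[None, :]) ** 2 / std ** 2)
        return features @ weights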
@@ -126,8 +126,8 @@ class Continuous_MountainCarEnv(gym.Env):

        self.render_mode = render_mode

-       self.screen_width = 600
-       self.screen_height = 400
+       self.screen_width = 480
+       self.screen_height = 320
        self.screen = None
        self.clock = None
        self.isopen = True
@@ -188,7 +188,7 @@ class Continuous_MountainCarEnv(gym.Env):
        return np.array(self.state, dtype=np.float32), {}

    def _height(self, xs):
-       return np.sin(3 * xs) * 0.45 + 0.55
+       return np.sin(3 * xs) * 0.45 + 0.5

    def render(self):
        if self.render_mode is None:
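One practical effect of the smaller render surface shows up in the RL service further down: assuming the rendered rgb_array matches the screen size, each flattened colour channel of a frame drops from 600 * 400 = 240000 values (the figure used in the commented-out random-data line there) to 480 * 320 = 153600 values.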
src/active_bo_ros/active_bo_ros/bo_service.py (new file, 0 lines shown)
@@ -19,7 +19,7 @@ class PolicyService(Node):

        policy = GaussianRBF(weight_len, nr_steps)
        policy.weights = weights
-       policy.policy_rollout()
+       policy.rollout()

        response.policy = policy.policy.flatten().tolist()
        return response
@@ -7,7 +7,6 @@ from rclpy.node import Node
from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

import numpy as np
import time

class RLService(Node):
    def __init__(self):
@@ -44,6 +43,10 @@ class RLService(Node):
        green = rgb_array[:, :, 1].flatten().tolist()
        blue = rgb_array[:, :, 2].flatten().tolist()

        # red = [255] * 28800 + [0] * 28800 + [0] * 28800
        # green = [0] * 28800 + [255] * 28800 + [0] * 28800
        # blue = [0] * 28800 + [0] * 28800 + [255] * 28800


        # random int data
        # red = np.random.randint(0, 255, 240000).tolist()
@@ -61,11 +64,10 @@
            if done:
                break

        if not done and i == len(policy) - 1:  # last loop index
            distance = -(self.env.goal_position - output[0][0])
            reward += distance * self.distance_penalty

        # time.sleep(0.01)

        response.reward = reward
        response.final_step = step_count
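The service hands each colour channel of the rendered frame back as a flat list. Below is a hypothetical client-side helper for reassembling a frame from those three lists; the 480 x 320 shape is an assumption based on the screen size configured above, and the actual response layout is not shown in this diff.

    import numpy as np

    def channels_to_frame(red, green, blue, height=320, width=480):
        """Rebuild an RGB frame from per-channel flattened pixel lists (sketch).

        Assumes each list is the row-major flattening of a height x width channel.
        """
        r = np.asarray(red, dtype=np.uint8).reshape(height, width)
        g = np.asarray(green, dtype=np.uint8).reshape(height, width)
        b = np.asarray(blue, dtype=np.uint8).reshape(height, width)
        return np.stack([r, g, b], axis=-1)  # shape (height, width, 3)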
src/active_bo_ros/launch/bo_service.launch.py (new file, 0 lines shown)