diff --git a/src/active_bo_ros/active_bo_ros/AcquisitionFunctions/__init__.py b/src/active_bo_ros/active_bo_ros/AcquisitionFunctions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py new file mode 100644 index 0000000..767374a --- /dev/null +++ b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py @@ -0,0 +1,149 @@ +import numpy as np +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import Matern + +from active_bo_ros.PolicyModel.GaussianRBFModel import GaussianRBF +from active_bo_ros.AcquistionFunctions.ExpectedImprovement import ExpectedImprovement +from active_bo_ros.AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement +from active_bo_ros.AcquistionFunctions.ConfidenceBound import ConfidenceBound + +from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv + + +class BayesianOptimization: + def __init__(self, env, nr_steps, nr_init=3, acq='ei', nr_weights=6, policy_seed=None): + self.env = env + self.nr_init = nr_init + self.acq = acq + + self.X = None + self.Y = None + self.GP = None + + self.episode = 0 + self.counter_array = np.empty((1, 1)) + self.best_reward = np.empty((1, 1)) + self.distance_penalty = 0 + + self.nr_policy_weights = nr_weights + self.nr_steps = nr_steps + self.policy_seed = policy_seed + self.lower_bound = -1.0 + self.upper_bound = 1.0 + + self.policy_model = GaussianRBF(self.nr_policy_weights, + self.nr_steps, + self.policy_seed, + self.lower_bound, + self.upper_bound) + + self.eval_X = 100 + + def reset_bo(self): + self.counter_array = np.empty((1, 1)) + self.GP = None + self.episode = 0 + self.best_reward = np.empty((1, 1)) + + def runner(self, policy): + done = False + env_reward = 0.0 + step_count = 0 + + for i in range(len(policy)): + action = policy[i] + output = self.env.step(action) + + env_reward += output[1] + done = output[2] + step_count += 1 + + if done: + self.counter_array = np.vstack((self.counter_array, step_count)) + break + + if not done and i == len(policy): + distance = -(self.env.goal_position - output[0][0]) + env_reward += distance * self.distance_penalty + self.counter_array = np.vstack((self.counter_array, step_count)) + + self.env.reset() + return env_reward, step_count + + def initialize(self): + self.env.reset() + self.reset_bo() + + self.X = np.zeros((self.nr_init, self.nr_policy_weights)) + self.Y = np.zeros((self.nr_init, 1)) + + for i in range(self.nr_init): + self.policy_model.random_policy() + self.X[i, :] = self.policy_model.weights.T + policy = self.policy_model.rollout() + + reward, step_count = self.runner(policy) + + self.Y[i] = reward + + self.GP = GaussianProcessRegressor(Matern(nu=1.5)) + self.GP.fit(self.X, self.Y) + + def next_observation(self): + if self.acq == 'ei': + x_next = ExpectedImprovement(self.GP, + self.X, + self.eval_X, + self.nr_policy_weights, + kappa=0, + seed=self.policy_seed, + lower=self.lower_bound, + upper=self.upper_bound) + + return x_next + + elif self.acq == 'pi': + x_next = ProbabilityOfImprovement(self.GP, + self.X, + self.eval_X, + self.nr_policy_weights, + kappa=0, + seed=self.policy_seed, + lower=self.lower_bound, + upper=self.upper_bound) + + return x_next + + elif self.acq == 'cb': + x_next = ConfidenceBound(self.GP, + self.X, + self.eval_X, + self.nr_policy_weights, + lam=2.576, + seed=self.policy_seed, + lower=self.lower_bound, + upper=self.upper_bound) + + return x_next + + else: + raise NotImplementedError + + def eval_new_observation(self, x_next): + self.policy_model.weights = x_next + policy = self.policy_model.rollout() + + reward, step_count = self.runner(policy) + + self.X = np.vstack((self.X, x_next)) + self.Y = np.vstack((self.Y, reward)) + + self.GP.fit(self.X, self.Y) + + self.best_reward = np.vstack((self.best_reward, max(self.Y))) + + self.episode += 1 + return step_count + + + diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/__init__.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py b/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py index ed791ef..b19d360 100644 --- a/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py +++ b/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py @@ -24,7 +24,7 @@ class GaussianRBF: def random_policy(self): self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights) - def policy_rollout(self): + def rollout(self): self.policy = np.zeros((self.nr_steps, 1)) for i in range(self.nr_steps): for j in range(self.nr_weights): @@ -33,16 +33,6 @@ class GaussianRBF: return self.policy - # def plot_policy(self, finished=np.NAN): - # x = np.linspace(0, self.nr_steps, self.nr_steps) - # plt.plot(x, self.trajectory) - # if finished != np.NAN: - # plt.vlines(finished, -1, 1, colors='red') - # # for i in self.mean: - # # gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2) - # # plt.plot(x, gaussian) - - def main(): policy = GaussianRBFModel(1, 50) policy.random_policy() diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/ContinuousMountainCar.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/ContinuousMountainCar.py index 4faeb23..5b0e91a 100644 --- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/ContinuousMountainCar.py +++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/ContinuousMountainCar.py @@ -126,8 +126,8 @@ class Continuous_MountainCarEnv(gym.Env): self.render_mode = render_mode - self.screen_width = 600 - self.screen_height = 400 + self.screen_width = 480 + self.screen_height = 320 self.screen = None self.clock = None self.isopen = True @@ -188,7 +188,7 @@ class Continuous_MountainCarEnv(gym.Env): return np.array(self.state, dtype=np.float32), {} def _height(self, xs): - return np.sin(3 * xs) * 0.45 + 0.55 + return np.sin(3 * xs) * 0.45 + 0.5 def render(self): if self.render_mode is None: diff --git a/src/active_bo_ros/active_bo_ros/bo_service.py b/src/active_bo_ros/active_bo_ros/bo_service.py new file mode 100644 index 0000000..e69de29 diff --git a/src/active_bo_ros/active_bo_ros/policy_service.py b/src/active_bo_ros/active_bo_ros/policy_service.py index 73f0de6..a499012 100644 --- a/src/active_bo_ros/active_bo_ros/policy_service.py +++ b/src/active_bo_ros/active_bo_ros/policy_service.py @@ -19,7 +19,7 @@ class PolicyService(Node): policy = GaussianRBF(weight_len, nr_steps) policy.weights = weights - policy.policy_rollout() + policy.rollout() response.policy = policy.policy.flatten().tolist() return response diff --git a/src/active_bo_ros/active_bo_ros/rl_service.py b/src/active_bo_ros/active_bo_ros/rl_service.py index 6934262..07ec28a 100644 --- a/src/active_bo_ros/active_bo_ros/rl_service.py +++ b/src/active_bo_ros/active_bo_ros/rl_service.py @@ -7,7 +7,6 @@ from rclpy.node import Node from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv import numpy as np -import time class RLService(Node): def __init__(self): @@ -44,6 +43,10 @@ class RLService(Node): green = rgb_array[:, :, 1].flatten().tolist() blue = rgb_array[:, :, 2].flatten().tolist() + # red = [255] * 28800 + [0] * 28800 + [0] * 28800 + # green = [0] * 28800 + [255] * 28800 + [0] * 28800 + # blue = [0] * 28800 + [0] * 28800 + [255] * 28800 + # random int data # red = np.random.randint(0, 255, 240000).tolist() @@ -61,10 +64,9 @@ class RLService(Node): if done: break - distance = -(self.env.goal_position - output[0][0]) - reward += distance * self.distance_penalty - - # time.sleep(0.01) + if not done and i == len(policy): + distance = -(self.env.goal_position - output[0][0]) + reward += distance * self.distance_penalty response.reward = reward response.final_step = step_count diff --git a/src/active_bo_ros/launch/bo_service.launch.py b/src/active_bo_ros/launch/bo_service.launch.py new file mode 100644 index 0000000..e69de29