Manual application complete, BO started

This commit is contained in:
Niko Feith 2023-03-06 15:43:32 +01:00
parent 5a8a353c0e
commit 916b11e933
9 changed files with 161 additions and 20 deletions

View File

@@ -0,0 +1,149 @@
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from active_bo_ros.PolicyModel.GaussianRBFModel import GaussianRBF
from active_bo_ros.AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
from active_bo_ros.AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
from active_bo_ros.AcquistionFunctions.ConfidenceBound import ConfidenceBound
from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

class BayesianOptimization:
    def __init__(self, env, nr_steps, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
        self.env = env
        self.nr_init = nr_init
        self.acq = acq

        self.X = None
        self.Y = None
        self.GP = None

        self.episode = 0
        # first rows are placeholders; per-episode results are stacked below them
        self.counter_array = np.zeros((1, 1))
        self.best_reward = np.zeros((1, 1))
        self.distance_penalty = 0

        self.nr_policy_weights = nr_weights
        self.nr_steps = nr_steps
        self.policy_seed = policy_seed
        self.lower_bound = -1.0
        self.upper_bound = 1.0

        self.policy_model = GaussianRBF(self.nr_policy_weights,
                                        self.nr_steps,
                                        self.policy_seed,
                                        self.lower_bound,
                                        self.upper_bound)

        # number of candidate points at which the acquisition function is evaluated
        self.eval_X = 100

    def reset_bo(self):
        self.counter_array = np.zeros((1, 1))
        self.GP = None
        self.episode = 0
        self.best_reward = np.zeros((1, 1))
    def runner(self, policy):
        # Roll the policy out step by step and accumulate the environment reward.
        done = False
        env_reward = 0.0
        step_count = 0

        for i in range(len(policy)):
            action = policy[i]
            output = self.env.step(action)
            env_reward += output[1]
            done = output[2]
            step_count += 1
            if done:
                self.counter_array = np.vstack((self.counter_array, step_count))
                break

        # If the rollout used all steps without finishing, penalize the remaining
        # distance to the goal (the final loop index is len(policy) - 1).
        if not done and i == len(policy) - 1:
            distance = -(self.env.goal_position - output[0][0])
            env_reward += distance * self.distance_penalty
            self.counter_array = np.vstack((self.counter_array, step_count))

        self.env.reset()
        return env_reward, step_count
    def initialize(self):
        self.env.reset()
        self.reset_bo()

        # Seed the surrogate with nr_init random policies and their rewards.
        self.X = np.zeros((self.nr_init, self.nr_policy_weights))
        self.Y = np.zeros((self.nr_init, 1))

        for i in range(self.nr_init):
            self.policy_model.random_policy()
            self.X[i, :] = self.policy_model.weights.T
            policy = self.policy_model.rollout()
            reward, step_count = self.runner(policy)
            self.Y[i] = reward

        self.GP = GaussianProcessRegressor(Matern(nu=1.5))
        self.GP.fit(self.X, self.Y)
    def next_observation(self):
        # Propose the next weight vector via the selected acquisition function,
        # evaluated at self.eval_X candidate points.
        if self.acq == 'ei':
            x_next = ExpectedImprovement(self.GP,
                                         self.X,
                                         self.eval_X,
                                         self.nr_policy_weights,
                                         kappa=0,
                                         seed=self.policy_seed,
                                         lower=self.lower_bound,
                                         upper=self.upper_bound)
        elif self.acq == 'pi':
            x_next = ProbabilityOfImprovement(self.GP,
                                              self.X,
                                              self.eval_X,
                                              self.nr_policy_weights,
                                              kappa=0,
                                              seed=self.policy_seed,
                                              lower=self.lower_bound,
                                              upper=self.upper_bound)
        elif self.acq == 'cb':
            x_next = ConfidenceBound(self.GP,
                                     self.X,
                                     self.eval_X,
                                     self.nr_policy_weights,
                                     lam=2.576,
                                     seed=self.policy_seed,
                                     lower=self.lower_bound,
                                     upper=self.upper_bound)
        else:
            raise NotImplementedError(f"unknown acquisition function: {self.acq}")

        return x_next
    def eval_new_observation(self, x_next):
        # Evaluate the proposed weights, then refit the GP on the augmented data.
        self.policy_model.weights = x_next
        policy = self.policy_model.rollout()
        reward, step_count = self.runner(policy)

        self.X = np.vstack((self.X, x_next))
        self.Y = np.vstack((self.Y, reward))
        self.GP.fit(self.X, self.Y)

        self.best_reward = np.vstack((self.best_reward, np.max(self.Y)))
        self.episode += 1
        return step_count
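
For orientation, a minimal sketch of how this class could be driven end to end. It is not part of this commit, and it assumes Continuous_MountainCarEnv constructs with default arguments and that next_observation() returns a weight vector compatible with the policy model:

from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

# hypothetical driver loop: seed the surrogate, then alternate between
# proposing and evaluating new weight vectors
env = Continuous_MountainCarEnv()
bo = BayesianOptimization(env, nr_steps=50, nr_init=3, acq='ei', nr_weights=6)
bo.initialize()

for _ in range(20):
    x_next = bo.next_observation()
    bo.eval_new_observation(x_next)

print("best reward so far:", float(bo.best_reward[-1]))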

View File

@@ -24,7 +24,7 @@ class GaussianRBF:
     def random_policy(self):
         self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)

-    def policy_rollout(self):
+    def rollout(self):
         self.policy = np.zeros((self.nr_steps, 1))
         for i in range(self.nr_steps):
             for j in range(self.nr_weights):
@@ -33,16 +33,6 @@ class GaussianRBF:
         return self.policy

-    # def plot_policy(self, finished=np.NAN):
-    #     x = np.linspace(0, self.nr_steps, self.nr_steps)
-    #     plt.plot(x, self.trajectory)
-    #     if finished != np.NAN:
-    #         plt.vlines(finished, -1, 1, colors='red')
-    #     # for i in self.mean:
-    #     #     gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
-    #     #     plt.plot(x, gaussian)

 def main():
     policy = GaussianRBFModel(1, 50)
     policy.random_policy()
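
Beyond the rename from policy_rollout() to rollout(), the deleted comment block hints at the policy's functional form: a trajectory built from Gaussian radial basis functions. A hedged reconstruction of what rollout() computes, with the names means and std assumed from that deleted plotting code:

import numpy as np

def rollout_sketch(weights, means, std, nr_steps):
    # each weight scales one Gaussian basis function centered at means[j]
    # along the step axis; the trajectory is their sum
    policy = np.zeros((nr_steps, 1))
    steps = np.arange(nr_steps)
    for j, w in enumerate(weights):
        policy[:, 0] += w * np.exp(-0.5 * (steps - means[j]) ** 2 / std ** 2)
    return policy

traj = rollout_sketch(np.random.uniform(-1, 1, 6), np.linspace(0, 49, 6), 5.0, 50)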

View File

@@ -126,8 +126,8 @@ class Continuous_MountainCarEnv(gym.Env):
         self.render_mode = render_mode

-        self.screen_width = 600
-        self.screen_height = 400
+        self.screen_width = 480
+        self.screen_height = 320
         self.screen = None
         self.clock = None
         self.isopen = True
@@ -188,7 +188,7 @@ class Continuous_MountainCarEnv(gym.Env):
         return np.array(self.state, dtype=np.float32), {}

     def _height(self, xs):
-        return np.sin(3 * xs) * 0.45 + 0.55
+        return np.sin(3 * xs) * 0.45 + 0.5

     def render(self):
         if self.render_mode is None:
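
The offset change in _height(), which render() uses to draw the track, shifts the curve down slightly: sin(3x) spans [-1, 1] over the standard position range [-1.2, 0.6], so the drawn height now spans about [0.05, 0.95] instead of [0.1, 1.0]. A quick check:

import numpy as np

xs = np.linspace(-1.2, 0.6, 1000)    # standard MountainCar position bounds
heights = np.sin(3 * xs) * 0.45 + 0.5
print(heights.min(), heights.max())  # roughly 0.05 and 0.95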

View File

@@ -19,7 +19,7 @@ class PolicyService(Node):
         policy = GaussianRBF(weight_len, nr_steps)
         policy.weights = weights
-        policy.policy_rollout()
+        policy.rollout()

         response.policy = policy.policy.flatten().tolist()
         return response

View File

@@ -7,7 +7,6 @@ from rclpy.node import Node
 from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

 import numpy as np
-import time

 class RLService(Node):
     def __init__(self):
@@ -44,6 +43,10 @@ class RLService(Node):
         green = rgb_array[:, :, 1].flatten().tolist()
         blue = rgb_array[:, :, 2].flatten().tolist()

+        # red = [255] * 28800 + [0] * 28800 + [0] * 28800
+        # green = [0] * 28800 + [255] * 28800 + [0] * 28800
+        # blue = [0] * 28800 + [0] * 28800 + [255] * 28800

         # random int data
         # red = np.random.randint(0, 255, 240000).tolist()
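
For reference, a sketch of the channel flattening above, assuming env.render() returns an (H, W, 3) uint8 array, which would be (320, 480, 3) after this commit's resize:

import numpy as np

rgb_array = np.zeros((320, 480, 3), dtype=np.uint8)  # stand-in for env.render()
red = rgb_array[:, :, 0].flatten().tolist()
green = rgb_array[:, :, 1].flatten().tolist()
blue = rgb_array[:, :, 2].flatten().tolist()
assert len(red) == 320 * 480  # one list entry per pixel, row-major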
@@ -61,11 +64,10 @@ class RLService(Node):
             if done:
                 break

+        if not done and i == len(policy) - 1:
             distance = -(self.env.goal_position - output[0][0])
             reward += distance * self.distance_penalty
-        # time.sleep(0.01)

         response.reward = reward
         response.final_step = step_count
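
A worked example of the terminal penalty added above; the distance_penalty value here is hypothetical, and goal_position defaults to 0.45 in the continuous mountain car environment:

goal_position = 0.45                         # Continuous_MountainCarEnv default
position = -0.3                              # car stalled left of the goal
distance = -(goal_position - position)       # -0.75: negative short of the goal
distance_penalty = 10                        # hypothetical scale factor
reward_delta = distance * distance_penalty   # -7.5 added to the episode reward

Note that BayesianOptimization initializes distance_penalty to 0, so the penalty stays inert until that attribute is set.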