Manual application complete, BO started
This commit is contained in:
parent 5a8a353c0e
commit 916b11e933
@@ -0,0 +1,149 @@
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

from active_bo_ros.PolicyModel.GaussianRBFModel import GaussianRBF
from active_bo_ros.AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
from active_bo_ros.AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
from active_bo_ros.AcquistionFunctions.ConfidenceBound import ConfidenceBound

from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv


class BayesianOptimization:
    def __init__(self, env, nr_steps, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
        self.env = env
        self.nr_init = nr_init
        self.acq = acq

        self.X = None
        self.Y = None
        self.GP = None

        self.episode = 0
        self.counter_array = np.empty((1, 1))
        self.best_reward = np.empty((1, 1))
        self.distance_penalty = 0

        self.nr_policy_weights = nr_weights
        self.nr_steps = nr_steps
        self.policy_seed = policy_seed
        self.lower_bound = -1.0
        self.upper_bound = 1.0

        self.policy_model = GaussianRBF(self.nr_policy_weights,
                                        self.nr_steps,
                                        self.policy_seed,
                                        self.lower_bound,
                                        self.upper_bound)

        self.eval_X = 100

    def reset_bo(self):
        self.counter_array = np.empty((1, 1))
        self.GP = None
        self.episode = 0
        self.best_reward = np.empty((1, 1))

    def runner(self, policy):
        done = False
        env_reward = 0.0
        step_count = 0

        for i in range(len(policy)):
            action = policy[i]
            output = self.env.step(action)

            env_reward += output[1]
            done = output[2]
            step_count += 1

            if done:
                self.counter_array = np.vstack((self.counter_array, step_count))
                break

        # apply the distance penalty only if the rollout used all steps without finishing
        if not done and i == len(policy) - 1:
            distance = -(self.env.goal_position - output[0][0])
            env_reward += distance * self.distance_penalty
            self.counter_array = np.vstack((self.counter_array, step_count))

        self.env.reset()
        return env_reward, step_count

    def initialize(self):
        self.env.reset()
        self.reset_bo()

        self.X = np.zeros((self.nr_init, self.nr_policy_weights))
        self.Y = np.zeros((self.nr_init, 1))

        for i in range(self.nr_init):
            self.policy_model.random_policy()
            self.X[i, :] = self.policy_model.weights.T
            policy = self.policy_model.rollout()

            reward, step_count = self.runner(policy)

            self.Y[i] = reward

        self.GP = GaussianProcessRegressor(Matern(nu=1.5))
        self.GP.fit(self.X, self.Y)

    def next_observation(self):
        if self.acq == 'ei':
            x_next = ExpectedImprovement(self.GP,
                                         self.X,
                                         self.eval_X,
                                         self.nr_policy_weights,
                                         kappa=0,
                                         seed=self.policy_seed,
                                         lower=self.lower_bound,
                                         upper=self.upper_bound)

            return x_next

        elif self.acq == 'pi':
            x_next = ProbabilityOfImprovement(self.GP,
                                              self.X,
                                              self.eval_X,
                                              self.nr_policy_weights,
                                              kappa=0,
                                              seed=self.policy_seed,
                                              lower=self.lower_bound,
                                              upper=self.upper_bound)

            return x_next

        elif self.acq == 'cb':
            x_next = ConfidenceBound(self.GP,
                                     self.X,
                                     self.eval_X,
                                     self.nr_policy_weights,
                                     lam=2.576,
                                     seed=self.policy_seed,
                                     lower=self.lower_bound,
                                     upper=self.upper_bound)

            return x_next

        else:
            raise NotImplementedError

    def eval_new_observation(self, x_next):
        self.policy_model.weights = x_next
        policy = self.policy_model.rollout()

        reward, step_count = self.runner(policy)

        self.X = np.vstack((self.X, x_next))
        self.Y = np.vstack((self.Y, reward))

        self.GP.fit(self.X, self.Y)

        self.best_reward = np.vstack((self.best_reward, max(self.Y)))

        self.episode += 1
        return step_count

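Note: the acquisition functions called in next_observation() are imported from active_bo_ros.AcquistionFunctions and are not part of this commit, so only their call signature is visible here. Below is a minimal sketch of what an expected-improvement routine with that signature might do; the sampling of eval_X random candidates, the use of the GP mean at the observed inputs as the incumbent, and the helper name expected_improvement_sketch are all assumptions, not the repository's implementation.

import numpy as np
from scipy.stats import norm


def expected_improvement_sketch(gp, X, nr_samples, nr_weights,
                                kappa=0.0, seed=None, lower=-1.0, upper=1.0):
    # Hypothetical stand-in for ExpectedImprovement: score nr_samples random
    # candidate weight vectors and return the most promising one.
    rng = np.random.default_rng(seed)
    x_cand = rng.uniform(lower, upper, (nr_samples, nr_weights))

    mu, sigma = gp.predict(x_cand, return_std=True)
    mu = np.asarray(mu).reshape(-1)
    sigma = np.asarray(sigma).reshape(-1)

    # Incumbent approximated by the GP mean at the observed inputs (assumption).
    y_best = np.max(gp.predict(X))

    with np.errstate(divide='ignore', invalid='ignore'):
        imp = mu - y_best - kappa          # kappa shifts the improvement threshold
        z = imp / sigma
        ei = imp * norm.cdf(z) + sigma * norm.pdf(z)
    ei[sigma == 0.0] = 0.0                 # no expected improvement where the GP is certain

    return x_cand[np.argmax(ei)]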
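For orientation, a driver for this class would presumably look like the loop below. The environment construction, the step count of 50, and the budget of 20 episodes are illustrative assumptions; only the method names come from the class above, which is assumed to be in scope since its module path is not shown in this commit.

from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

env = Continuous_MountainCarEnv()                # assumed default construction
bo = BayesianOptimization(env, nr_steps=50, nr_init=3, acq='ei', nr_weights=6)

bo.initialize()                                  # seed the GP with random policies
for _ in range(20):                              # assumed BO budget
    x_next = bo.next_observation()               # acquisition proposes new weights
    bo.eval_new_observation(x_next)              # roll out, refit GP, track best reward

print(bo.best_reward[-1])                        # best reward seen so far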
@@ -24,7 +24,7 @@ class GaussianRBF:
    def random_policy(self):
        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)

-    def policy_rollout(self):
+    def rollout(self):
        self.policy = np.zeros((self.nr_steps, 1))
        for i in range(self.nr_steps):
            for j in range(self.nr_weights):
@@ -33,16 +33,6 @@ class GaussianRBF:

        return self.policy

-    # def plot_policy(self, finished=np.NAN):
-    #     x = np.linspace(0, self.nr_steps, self.nr_steps)
-    #     plt.plot(x, self.trajectory)
-    #     if finished != np.NAN:
-    #         plt.vlines(finished, -1, 1, colors='red')
-    #     # for i in self.mean:
-    #     #     gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
-    #     #     plt.plot(x, gaussian)
-
-

def main():
    policy = GaussianRBFModel(1, 50)
    policy.random_policy()
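The body of rollout() is truncated in this hunk; the visible double loop and the commented-out Gaussian expression suggest the policy is a weighted sum of radial basis functions over the step index. A sketch of that computation under this assumption follows; the parameter names means and std mirror the commented code and are not verified against the full file.

import numpy as np


def rbf_rollout_sketch(weights, means, std, nr_steps):
    # policy[i] = sum_j weights[j] * exp(-0.5 * (i - means[j])**2 / std**2)
    policy = np.zeros((nr_steps, 1))
    for i in range(nr_steps):
        for j in range(len(weights)):
            policy[i] += weights[j] * np.exp(-0.5 * (i - means[j]) ** 2 / std ** 2)
    return policy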
@@ -126,8 +126,8 @@ class Continuous_MountainCarEnv(gym.Env):

        self.render_mode = render_mode

-        self.screen_width = 600
-        self.screen_height = 400
+        self.screen_width = 480
+        self.screen_height = 320
        self.screen = None
        self.clock = None
        self.isopen = True
@@ -188,7 +188,7 @@ class Continuous_MountainCarEnv(gym.Env):
        return np.array(self.state, dtype=np.float32), {}

    def _height(self, xs):
-        return np.sin(3 * xs) * 0.45 + 0.55
+        return np.sin(3 * xs) * 0.45 + 0.5

    def render(self):
        if self.render_mode is None:
src/active_bo_ros/active_bo_ros/bo_service.py (new file)
@@ -19,7 +19,7 @@ class PolicyService(Node):

        policy = GaussianRBF(weight_len, nr_steps)
        policy.weights = weights
-        policy.policy_rollout()
+        policy.rollout()

        response.policy = policy.policy.flatten().tolist()
        return response
@@ -7,7 +7,6 @@ from rclpy.node import Node
from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

import numpy as np
-import time


class RLService(Node):
    def __init__(self):
@@ -44,6 +43,10 @@ class RLService(Node):
        green = rgb_array[:, :, 1].flatten().tolist()
        blue = rgb_array[:, :, 2].flatten().tolist()

+        # red = [255] * 28800 + [0] * 28800 + [0] * 28800
+        # green = [0] * 28800 + [255] * 28800 + [0] * 28800
+        # blue = [0] * 28800 + [0] * 28800 + [255] * 28800
+
        # random int data
        # red = np.random.randint(0, 255, 240000).tolist()
@@ -61,10 +64,9 @@ class RLService(Node):
            if done:
                break

-        distance = -(self.env.goal_position - output[0][0])
-        reward += distance * self.distance_penalty
-
-        # time.sleep(0.01)
+        if not done and i == len(policy) - 1:
+            distance = -(self.env.goal_position - output[0][0])
+            reward += distance * self.distance_penalty

        response.reward = reward
        response.final_step = step_count
src/active_bo_ros/launch/bo_service.launch.py (new file)