Manual application complete, BO started
This commit is contained in:
parent 5a8a353c0e
commit 916b11e933
@@ -0,0 +1,149 @@
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

from active_bo_ros.PolicyModel.GaussianRBFModel import GaussianRBF
from active_bo_ros.AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
from active_bo_ros.AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
from active_bo_ros.AcquistionFunctions.ConfidenceBound import ConfidenceBound

from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv


class BayesianOptimization:
    def __init__(self, env, nr_steps, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
        self.env = env
        self.nr_init = nr_init
        self.acq = acq

        self.X = None
        self.Y = None
        self.GP = None

        self.episode = 0
        self.counter_array = np.empty((1, 1))
        self.best_reward = np.empty((1, 1))
        self.distance_penalty = 0

        self.nr_policy_weights = nr_weights
        self.nr_steps = nr_steps
        self.policy_seed = policy_seed
        self.lower_bound = -1.0
        self.upper_bound = 1.0

        self.policy_model = GaussianRBF(self.nr_policy_weights,
                                        self.nr_steps,
                                        self.policy_seed,
                                        self.lower_bound,
                                        self.upper_bound)

        self.eval_X = 100

    def reset_bo(self):
        self.counter_array = np.empty((1, 1))
        self.GP = None
        self.episode = 0
        self.best_reward = np.empty((1, 1))

    def runner(self, policy):
        done = False
        env_reward = 0.0
        step_count = 0

        for i in range(len(policy)):
            action = policy[i]
            output = self.env.step(action)

            env_reward += output[1]
            done = output[2]
            step_count += 1

            if done:
                self.counter_array = np.vstack((self.counter_array, step_count))
                break

        if not done and i == len(policy) - 1:  # last loop index; episode ran out of steps
            # Penalize by the remaining distance to the goal.
            distance = -(self.env.goal_position - output[0][0])
            env_reward += distance * self.distance_penalty
            self.counter_array = np.vstack((self.counter_array, step_count))

        self.env.reset()
        return env_reward, step_count

    def initialize(self):
        self.env.reset()
        self.reset_bo()

        self.X = np.zeros((self.nr_init, self.nr_policy_weights))
        self.Y = np.zeros((self.nr_init, 1))

        for i in range(self.nr_init):
            self.policy_model.random_policy()
            self.X[i, :] = self.policy_model.weights.T
            policy = self.policy_model.rollout()

            reward, step_count = self.runner(policy)

            self.Y[i] = reward

        self.GP = GaussianProcessRegressor(Matern(nu=1.5))
        self.GP.fit(self.X, self.Y)

    def next_observation(self):
        if self.acq == 'ei':
            x_next = ExpectedImprovement(self.GP,
                                         self.X,
                                         self.eval_X,
                                         self.nr_policy_weights,
                                         kappa=0,
                                         seed=self.policy_seed,
                                         lower=self.lower_bound,
                                         upper=self.upper_bound)

            return x_next

        elif self.acq == 'pi':
            x_next = ProbabilityOfImprovement(self.GP,
                                              self.X,
                                              self.eval_X,
                                              self.nr_policy_weights,
                                              kappa=0,
                                              seed=self.policy_seed,
                                              lower=self.lower_bound,
                                              upper=self.upper_bound)

            return x_next

        elif self.acq == 'cb':
            x_next = ConfidenceBound(self.GP,
                                     self.X,
                                     self.eval_X,
                                     self.nr_policy_weights,
                                     lam=2.576,
                                     seed=self.policy_seed,
                                     lower=self.lower_bound,
                                     upper=self.upper_bound)

            return x_next

        else:
            raise NotImplementedError

    def eval_new_observation(self, x_next):
        self.policy_model.weights = x_next
        policy = self.policy_model.rollout()

        reward, step_count = self.runner(policy)

        self.X = np.vstack((self.X, x_next))
        self.Y = np.vstack((self.Y, reward))

        self.GP.fit(self.X, self.Y)

        self.best_reward = np.vstack((self.best_reward, max(self.Y)))

        self.episode += 1
        return step_count
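For orientation, the class above is meant to be driven as an initialize / suggest / evaluate loop. The following is a minimal driver sketch, not code from the commit: the environment choice, nr_steps=50 and the 20-episode budget are assumptions, and it reuses the imports at the top of the file.

    # Hypothetical driver loop for BayesianOptimization, for illustration only.
    env = Continuous_MountainCarEnv()
    bo = BayesianOptimization(env, nr_steps=50, nr_init=3, acq='ei', nr_weights=6)

    bo.initialize()                       # roll out nr_init random policies, fit the first GP
    for _ in range(20):                   # assumed optimization budget
        x_next = bo.next_observation()    # query the chosen acquisition function
        bo.eval_new_observation(x_next)   # roll out x_next, append (x, y), refit the GP

    best_weights = bo.X[np.argmax(bo.Y)]  # best policy weights observed so far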
@@ -24,7 +24,7 @@ class GaussianRBF:
    def random_policy(self):
        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)

-    def policy_rollout(self):
+    def rollout(self):
        self.policy = np.zeros((self.nr_steps, 1))
        for i in range(self.nr_steps):
            for j in range(self.nr_weights):
@@ -33,16 +33,6 @@ class GaussianRBF:

        return self.policy

    # def plot_policy(self, finished=np.NAN):
    #     x = np.linspace(0, self.nr_steps, self.nr_steps)
    #     plt.plot(x, self.trajectory)
    #     if finished != np.NAN:
    #         plt.vlines(finished, -1, 1, colors='red')
    #     # for i in self.mean:
    #     #     gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
    #     #     plt.plot(x, gaussian)


def main():
    policy = GaussianRBFModel(1, 50)
    policy.random_policy()
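The rollout above evaluates, for every time step, a weighted sum of Gaussian radial basis functions over the step index. As a self-contained illustration of that mapping (the evenly spaced centres and the width are assumptions; the repository class stores its own means and std internally):

    import numpy as np

    def rbf_policy(weights, nr_steps, std=None):
        """Map RBF weights to an open-loop action sequence of length nr_steps (sketch)."""
        nr_weights = len(weights)
        centres = np.linspace(0, nr_steps - 1, nr_weights)        # assumed: evenly spaced centres
        std = std if std is not None else nr_steps / nr_weights   # assumed basis width
        steps = np.arange(nr_steps)
        # policy[i] = sum_j w_j * exp(-0.5 * (i - c_j)^2 / std^2)
        features = np.exp(-0.5 * (steps[:, None] - centres[None, :]) ** 2 / std ** 2)
        return features @ weights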
@@ -126,8 +126,8 @@ class Continuous_MountainCarEnv(gym.Env):

        self.render_mode = render_mode

-       self.screen_width = 600
-       self.screen_height = 400
+       self.screen_width = 480
+       self.screen_height = 320
        self.screen = None
        self.clock = None
        self.isopen = True
@@ -188,7 +188,7 @@ class Continuous_MountainCarEnv(gym.Env):
        return np.array(self.state, dtype=np.float32), {}

    def _height(self, xs):
-       return np.sin(3 * xs) * 0.45 + 0.55
+       return np.sin(3 * xs) * 0.45 + 0.5

    def render(self):
        if self.render_mode is None:
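One practical effect of the smaller render surface shows up in the RL service further down: assuming the rendered rgb_array matches the screen size, each flattened colour channel of a frame drops from 600 * 400 = 240000 values (the figure used in the commented-out random-data line there) to 480 * 320 = 153600 values.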
src/active_bo_ros/active_bo_ros/bo_service.py (new file, 0 lines shown)
@@ -19,7 +19,7 @@ class PolicyService(Node):

        policy = GaussianRBF(weight_len, nr_steps)
        policy.weights = weights
-       policy.policy_rollout()
+       policy.rollout()

        response.policy = policy.policy.flatten().tolist()
        return response
@@ -7,7 +7,6 @@ from rclpy.node import Node
from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

import numpy as np
import time

class RLService(Node):
    def __init__(self):
@@ -44,6 +43,10 @@ class RLService(Node):
        green = rgb_array[:, :, 1].flatten().tolist()
        blue = rgb_array[:, :, 2].flatten().tolist()

        # red = [255] * 28800 + [0] * 28800 + [0] * 28800
        # green = [0] * 28800 + [255] * 28800 + [0] * 28800
        # blue = [0] * 28800 + [0] * 28800 + [255] * 28800


        # random int data
        # red = np.random.randint(0, 255, 240000).tolist()
@@ -61,11 +64,10 @@
            if done:
                break

        if not done and i == len(policy) - 1:  # last loop index
            distance = -(self.env.goal_position - output[0][0])
            reward += distance * self.distance_penalty

        # time.sleep(0.01)

        response.reward = reward
        response.final_step = step_count
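The service hands each colour channel of the rendered frame back as a flat list. Below is a hypothetical client-side helper for reassembling a frame from those three lists; the 480 x 320 shape is an assumption based on the screen size configured above, and the actual response layout is not shown in this diff.

    import numpy as np

    def channels_to_frame(red, green, blue, height=320, width=480):
        """Rebuild an RGB frame from per-channel flattened pixel lists (sketch).

        Assumes each list is the row-major flattening of a height x width channel.
        """
        r = np.asarray(red, dtype=np.uint8).reshape(height, width)
        g = np.asarray(green, dtype=np.uint8).reshape(height, width)
        b = np.asarray(blue, dtype=np.uint8).reshape(height, width)
        return np.stack([r, g, b], axis=-1)  # shape (height, width, 3)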
src/active_bo_ros/launch/bo_service.launch.py (new file, 0 lines shown)