added more Acquisition functions

This commit is contained in:
Niko Feith 2023-02-06 15:43:30 +01:00
parent 1f4f878783
commit 78eccba14b
4 changed files with 119 additions and 16 deletions

View File: AcquistionFunctions/ConfidenceBound.py (new file)

@@ -0,0 +1,14 @@
import numpy as np
from scipy.stats import norm


def ConfidenceBound(gp, X, nr_test, nr_weights, lam=1.2, seed=None, lower=-1.0, upper=1.0):
    # Upper confidence bound: sample nr_test random candidate weight vectors and
    # return the one maximising mu + lam * sigma under the fitted GP surrogate.
    # (y_hat and best_y are not used by this criterion.)
    y_hat = gp.predict(X)
    best_y = max(y_hat)

    rng = np.random.default_rng(seed=seed)
    X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
    mu, sigma = gp.predict(X_test, return_std=True)

    cb = mu + lam * sigma
    idx = np.argmax(cb)
    X_next = X_test[idx, :]
    return X_next
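For orientation, here is a minimal usage sketch of the new confidence-bound (UCB) acquisition. It assumes the function lives at the AcquistionFunctions.ConfidenceBound path used by the imports further down, and it fits the surrogate on toy data rather than real rollout rewards:

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

from AcquistionFunctions.ConfidenceBound import ConfidenceBound  # assumed module path

# Toy surrogate: 5 random 6-dimensional weight vectors with made-up rewards.
rng = np.random.default_rng(0)
X_init = rng.uniform(-1.0, 1.0, (5, 6))
y_init = rng.normal(size=5)

gp = GaussianProcessRegressor(Matern(nu=1.5))
gp.fit(X_init, y_init)

# Score 100 uniformly sampled candidate weight vectors and return the UCB maximiser.
x_next = ConfidenceBound(gp, X_init, nr_test=100, nr_weights=6, lam=1.2, seed=0)
print(x_next.shape)  # (6,)
```

Larger lam values weight the predictive standard deviation more heavily, so the search explores more aggressively.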

View File: AcquistionFunctions/ProbabilityOfImprovement.py (new file)

@@ -0,0 +1,15 @@
import numpy as np
from scipy.stats import norm


def ProbabilityOfImprovement(gp, X, nr_test, nr_weights, kappa=2.576, seed=None, lower=-1.0, upper=1.0):
    # Probability of improvement: among nr_test random candidate weight vectors,
    # pick the one most likely to beat the best predicted reward by at least kappa.
    y_hat = gp.predict(X)
    best_y = max(y_hat)

    rng = np.random.default_rng(seed=seed)
    X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
    mu, sigma = gp.predict(X_test, return_std=True)

    z = (mu - best_y - kappa) / sigma
    pi = norm.cdf(z)
    idx = np.argmax(pi)
    X_next = X_test[idx, :]
    return X_next
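As a quick sanity check on the closed form used above, PI = Phi((mu - best_y - kappa) / sigma), here is a tiny numeric sketch with assumed toy values (not taken from any rollout):

```python
from scipy.stats import norm

# Assumed toy values for a single candidate point.
mu, sigma = 0.4, 0.2   # GP posterior mean and standard deviation at the candidate
best_y = 0.3           # best predicted reward among the evaluated weights
kappa = 0.05           # minimum improvement margin

z = (mu - best_y - kappa) / sigma
pi = norm.cdf(z)
print(round(pi, 3))    # 0.599: probability of improving on best_y by at least kappa
```

With the default kappa=2.576 the required margin is large, so candidates only score well when the GP predicts a big gain or is very uncertain about them.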

View File

@@ -4,6 +4,8 @@ from sklearn.gaussian_process.kernels import Matern
from PolicyModel.GaussianModel import GaussianPolicy
from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
+from AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
+from AcquistionFunctions.ConfidenceBound import ConfidenceBound
from ToyTask.MountainCarGym import Continuous_MountainCarEnv
@@ -12,6 +14,7 @@ import time
import matplotlib.pyplot as plt

class BayesianOptimization:
    def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
        self.env = env
@@ -19,11 +22,12 @@ class BayesianOptimization:
        self.acq = acq
        self.X = None
        self.Y = None
+        self.counter_array = np.zeros((1, 1))
        self.gp = None
        self.episode = 0
        self.best_reward = np.empty((1, 1))
-        self.distance_penalty = 100
+        self.distance_penalty = 0
        self.nr_policy_weights = nr_weights
        self.nr_steps = nr_step
@@ -40,6 +44,7 @@ class BayesianOptimization:
    def initialize(self):
        self.env.reset()
+        if self.env.render_mode == 'human':
            self.env.render()

        self.X = np.zeros((self.nr_init, self.nr_policy_weights))
@@ -50,10 +55,11 @@
            self.X[i, :] = self.policy_model.weights.T
            policy = self.policy_model.policy_rollout()
-            reward = self.runner(policy)
+            reward, step_count = self.runner(policy)
            self.Y[i] = reward

        self.gp = GaussianProcessRegressor(Matern(nu=1.5))
        self.gp.fit(self.X, self.Y)
@@ -66,6 +72,7 @@
            output = self.env.step(action)
            env_reward += output[1]
            done = output[2]
+            if self.env.render_mode == 'human':
                time.sleep(0.0001)
            step_count += 1
            if step_count >= self.nr_steps:
@@ -73,9 +80,21 @@
                distance = -(self.env.goal_position - output[0][0])
                env_reward += distance * self.distance_penalty

+        # Record how many steps this rollout took; NaN marks a rollout that ran
+        # the full 100 steps (the nr_steps limit used in main) without finishing.
+        if self.counter_array[0] == 0:
+            if step_count == 100:
+                step_count = np.NAN
+            self.counter_array[0] = step_count
+        else:
+            if step_count == 100:
+                step_count = np.NAN
+            self.counter_array = np.vstack((self.counter_array, step_count))
+
+        if self.env.render_mode == 'human':
            time.sleep(0.25)
        self.env.reset()
-        return env_reward
+        return env_reward, step_count

    def next_observation(self):
        if self.acq == 'ei':
@@ -83,6 +102,31 @@
                                         self.X,
                                         self.nr_test,
                                         self.nr_policy_weights,
+                                         kappa=0,
+                                         seed=self.policy_seed,
+                                         lower=self.lowerb,
+                                         upper=self.upperb)
+            return x_next
+
+        elif self.acq == 'pi':
+            x_next = ProbabilityOfImprovement(self.gp,
+                                              self.X,
+                                              self.nr_test,
+                                              self.nr_policy_weights,
+                                              kappa=2.576,
+                                              seed=self.policy_seed,
+                                              lower=self.lowerb,
+                                              upper=self.upperb)
+            return x_next
+
+        elif self.acq == 'cb':
+            x_next = ConfidenceBound(self.gp,
+                                     self.X,
+                                     self.nr_test,
+                                     self.nr_policy_weights,
+                                     lam=2.576,
                                     seed=self.policy_seed,
                                     lower=self.lowerb,
                                     upper=self.upperb)
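For reference, the new branches are reached purely through the acq flag the constructor already accepts; a minimal sketch, assuming this module and the ToyTask environment import as in the lines above (it mirrors main() further down, just with acq='pi'):

```python
from ToyTask.MountainCarGym import Continuous_MountainCarEnv

env = Continuous_MountainCarEnv()
bo = BayesianOptimization(env, nr_step=100, acq='pi')  # 'ei', 'pi' or 'cb'
bo.initialize()

x_next = bo.next_observation()                # candidate weights from the chosen acquisition
step_count = bo.eval_new_observation(x_next)  # roll out the policy and update the GP
```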
@@ -95,7 +139,7 @@
    def eval_new_observation(self, x_next):
        self.policy_model.weights = x_next
        policy = self.policy_model.policy_rollout()
-        reward = self.runner(policy)
+        reward, step_count = self.runner(policy)

        self.X = np.vstack((self.X, x_next))
        self.Y = np.vstack((self.Y, reward))
@@ -108,28 +152,58 @@
            self.best_reward = np.vstack((self.best_reward, max(self.Y)))

        self.episode += 1
-        self.policy_model.plot_policy()
+        return step_count

    def plot_reward(self):
        episodes = np.linspace(0, self.episode, self.episode)
        plt.plot(episodes, self.best_reward)
        plt.show()
+    def plot_objective_function(self):
+        # Scatter the evaluated weights (first dimension) against their rewards and
+        # overlay a 1-D GP slice (all weights varied together) with a 95% confidence band.
+        plt.scatter(self.X[:, 0], self.Y)
+        x_dom = np.linspace(self.lowerb * np.ones((1, 6)), self.upperb * np.ones((1, 6)), 100).reshape(-1, self.nr_policy_weights)
+        y_plot, y_sigma = self.gp.predict(x_dom, return_std=True)
+
+        y_plot = y_plot.reshape(-1)
+        y_sigma = y_sigma.reshape(-1)
+        print(y_plot.shape, x_dom.shape)
+
+        plt.plot(x_dom[:, 0], y_plot)
+        plt.fill_between(
+            x_dom[:, 0],
+            y_plot - 1.96 * y_sigma,
+            y_plot + 1.96 * y_sigma,
+            alpha=0.5
+        )
+        plt.show()
+    def get_best_result(self):
+        # Re-run the policy whose weights the GP rates best and plot it, marking
+        # the step at which that rollout originally finished (NaN if it never did).
+        y_hat = self.gp.predict(self.X)
+        idx = np.argmax(y_hat)
+        x_max = self.X[idx, :]
+        self.policy_model.weights = x_max
+        self.policy_model.policy_rollout()
+        print(self.counter_array[idx], idx)
+        self.policy_model.plot_policy(finished=self.counter_array[idx])
def main():
    nr_steps = 100
-    env = Continuous_MountainCarEnv(render_mode='human')
+    env = Continuous_MountainCarEnv()  # render_mode='human'
-    bo = BayesianOptimization(env, nr_steps)
+    bo = BayesianOptimization(env, nr_steps, acq='cb')
    bo.initialize()
    iteration_steps = 200
    for i in range(iteration_steps):
        x_next = bo.next_observation()
-        bo.eval_new_observation(x_next)
+        step_count = bo.eval_new_observation(x_next)
-        print(bo.episode, bo.best_reward[-1][0])
+        print(bo.episode, bo.best_reward[-1][0], step_count)

    bo.plot_reward()
-    bo.policy_model.plot_policy()
+    bo.get_best_result()
+    plt.show()

if __name__ == "__main__":
    main()

View File: PolicyModel/GaussianModel.py

@@ -35,15 +35,15 @@ class GaussianPolicy:
        return self.trajectory

-    def plot_policy(self):
+    def plot_policy(self, finished=np.NAN):
        x = np.linspace(0, self.nr_steps, self.nr_steps)
        plt.plot(x, self.trajectory)
+        # Mark the step at which the rollout finished; 'finished' is NaN when the
+        # rollout hit the step limit, in which case no marker is drawn.
+        if not np.isnan(finished):
+            plt.vlines(finished, -1, 1, colors='red')
        # for i in self.mean:
        #     gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
        #     plt.plot(x, gaussian)
+        plt.show()

def main():
    policy = GaussianPolicy(1, 50)