added more Acquisition functions
This commit is contained in:
parent
1f4f878783
commit
78eccba14b
14
AcquistionFunctions/ConfidenceBound.py
Normal file
14
AcquistionFunctions/ConfidenceBound.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import numpy as np
|
||||||
|
from scipy.stats import norm
|
||||||
|
|
||||||
|
def ConfidenceBound(gp, X, nr_test, nr_weights, lam=1.2, seed=None, lower=-1.0, upper=1.0):
|
||||||
|
y_hat = gp.predict(X)
|
||||||
|
best_y = max(y_hat)
|
||||||
|
rng = np.random.default_rng(seed=seed)
|
||||||
|
X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
|
||||||
|
mu, sigma = gp.predict(X_test, return_std=True)
|
||||||
|
cb = mu + lam * sigma
|
||||||
|
|
||||||
|
idx = np.argmax(cb)
|
||||||
|
X_next = X_test[idx, :]
|
||||||
|
return X_next
|
15
AcquistionFunctions/ProbabilityOfImprovement.py
Normal file
15
AcquistionFunctions/ProbabilityOfImprovement.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
import numpy as np
|
||||||
|
from scipy.stats import norm
|
||||||
|
|
||||||
|
def ProbabilityOfImprovement(gp, X, nr_test, nr_weights, kappa=2.576, seed=None, lower=-1.0, upper=1.0):
|
||||||
|
y_hat = gp.predict(X)
|
||||||
|
best_y = max(y_hat)
|
||||||
|
rng = np.random.default_rng(seed=seed)
|
||||||
|
X_test = rng.uniform(lower, upper, (nr_test, nr_weights))
|
||||||
|
mu, sigma = gp.predict(X_test, return_std=True)
|
||||||
|
z = (mu - best_y - kappa) / sigma
|
||||||
|
pi = norm.cdf(z)
|
||||||
|
|
||||||
|
idx = np.argmax(pi)
|
||||||
|
X_next = X_test[idx, :]
|
||||||
|
return X_next
|
@ -4,6 +4,8 @@ from sklearn.gaussian_process.kernels import Matern
|
|||||||
|
|
||||||
from PolicyModel.GaussianModel import GaussianPolicy
|
from PolicyModel.GaussianModel import GaussianPolicy
|
||||||
from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
|
from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
|
||||||
|
from AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
|
||||||
|
from AcquistionFunctions.ConfidenceBound import ConfidenceBound
|
||||||
|
|
||||||
from ToyTask.MountainCarGym import Continuous_MountainCarEnv
|
from ToyTask.MountainCarGym import Continuous_MountainCarEnv
|
||||||
|
|
||||||
@ -12,6 +14,7 @@ import time
|
|||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
class BayesianOptimization:
|
class BayesianOptimization:
|
||||||
def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
|
def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
|
||||||
self.env = env
|
self.env = env
|
||||||
@ -19,11 +22,12 @@ class BayesianOptimization:
|
|||||||
self.acq = acq
|
self.acq = acq
|
||||||
self.X = None
|
self.X = None
|
||||||
self.Y = None
|
self.Y = None
|
||||||
|
self.counter_array = np.zeros((1, 1))
|
||||||
self.gp = None
|
self.gp = None
|
||||||
|
|
||||||
self.episode = 0
|
self.episode = 0
|
||||||
self.best_reward = np.empty((1, 1))
|
self.best_reward = np.empty((1, 1))
|
||||||
self.distance_penalty = 100
|
self.distance_penalty = 0
|
||||||
|
|
||||||
self.nr_policy_weights = nr_weights
|
self.nr_policy_weights = nr_weights
|
||||||
self.nr_steps = nr_step
|
self.nr_steps = nr_step
|
||||||
@ -40,7 +44,8 @@ class BayesianOptimization:
|
|||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.env.reset()
|
self.env.reset()
|
||||||
self.env.render()
|
if self.env.render_mode == 'human':
|
||||||
|
self.env.render()
|
||||||
|
|
||||||
self.X = np.zeros((self.nr_init, self.nr_policy_weights))
|
self.X = np.zeros((self.nr_init, self.nr_policy_weights))
|
||||||
self.Y = np.zeros((self.nr_init, 1))
|
self.Y = np.zeros((self.nr_init, 1))
|
||||||
@ -50,10 +55,11 @@ class BayesianOptimization:
|
|||||||
self.X[i, :] = self.policy_model.weights.T
|
self.X[i, :] = self.policy_model.weights.T
|
||||||
policy = self.policy_model.policy_rollout()
|
policy = self.policy_model.policy_rollout()
|
||||||
|
|
||||||
reward = self.runner(policy)
|
reward, step_count = self.runner(policy)
|
||||||
|
|
||||||
self.Y[i] = reward
|
self.Y[i] = reward
|
||||||
|
|
||||||
|
|
||||||
self.gp = GaussianProcessRegressor(Matern(nu=1.5))
|
self.gp = GaussianProcessRegressor(Matern(nu=1.5))
|
||||||
self.gp.fit(self.X, self.Y)
|
self.gp.fit(self.X, self.Y)
|
||||||
|
|
||||||
@ -66,16 +72,29 @@ class BayesianOptimization:
|
|||||||
output = self.env.step(action)
|
output = self.env.step(action)
|
||||||
env_reward += output[1]
|
env_reward += output[1]
|
||||||
done = output[2]
|
done = output[2]
|
||||||
time.sleep(0.0001)
|
if self.env.render_mode == 'human':
|
||||||
|
time.sleep(0.0001)
|
||||||
step_count += 1
|
step_count += 1
|
||||||
if step_count >= self.nr_steps:
|
if step_count >= self.nr_steps:
|
||||||
done = True
|
done = True
|
||||||
distance = -(self.env.goal_position - output[0][0])
|
distance = -(self.env.goal_position - output[0][0])
|
||||||
env_reward += distance * self.distance_penalty
|
env_reward += distance * self.distance_penalty
|
||||||
|
|
||||||
time.sleep(0.25)
|
if self.counter_array[0] == 0:
|
||||||
|
|
||||||
|
if step_count == 100:
|
||||||
|
step_count = np.NAN
|
||||||
|
self.counter_array[0] = step_count
|
||||||
|
|
||||||
|
else:
|
||||||
|
if step_count == 100:
|
||||||
|
step_count = np.NAN
|
||||||
|
self.counter_array = np.vstack((self.counter_array, step_count))
|
||||||
|
|
||||||
|
if self.env.render_mode == 'human':
|
||||||
|
time.sleep(0.25)
|
||||||
self.env.reset()
|
self.env.reset()
|
||||||
return env_reward
|
return env_reward, step_count
|
||||||
|
|
||||||
def next_observation(self):
|
def next_observation(self):
|
||||||
if self.acq == 'ei':
|
if self.acq == 'ei':
|
||||||
@ -83,19 +102,44 @@ class BayesianOptimization:
|
|||||||
self.X,
|
self.X,
|
||||||
self.nr_test,
|
self.nr_test,
|
||||||
self.nr_policy_weights,
|
self.nr_policy_weights,
|
||||||
|
kappa=0,
|
||||||
seed=self.policy_seed,
|
seed=self.policy_seed,
|
||||||
lower=self.lowerb,
|
lower=self.lowerb,
|
||||||
upper=self.upperb)
|
upper=self.upperb)
|
||||||
|
|
||||||
return x_next
|
return x_next
|
||||||
|
|
||||||
|
elif self.acq == 'pi':
|
||||||
|
x_next = ProbabilityOfImprovement(self.gp,
|
||||||
|
self.X,
|
||||||
|
self.nr_test,
|
||||||
|
self.nr_policy_weights,
|
||||||
|
kappa=2.576,
|
||||||
|
seed=self.policy_seed,
|
||||||
|
lower=self.lowerb,
|
||||||
|
upper=self.upperb)
|
||||||
|
|
||||||
|
return x_next
|
||||||
|
|
||||||
|
elif self.acq == 'cb':
|
||||||
|
x_next = ConfidenceBound(self.gp,
|
||||||
|
self.X,
|
||||||
|
self.nr_test,
|
||||||
|
self.nr_policy_weights,
|
||||||
|
lam=2.576,
|
||||||
|
seed=self.policy_seed,
|
||||||
|
lower=self.lowerb,
|
||||||
|
upper=self.upperb)
|
||||||
|
|
||||||
|
return x_next
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def eval_new_observation(self, x_next):
|
def eval_new_observation(self, x_next):
|
||||||
self.policy_model.weights = x_next
|
self.policy_model.weights = x_next
|
||||||
policy = self.policy_model.policy_rollout()
|
policy = self.policy_model.policy_rollout()
|
||||||
reward = self.runner(policy)
|
reward, step_count = self.runner(policy)
|
||||||
|
|
||||||
self.X = np.vstack((self.X, x_next))
|
self.X = np.vstack((self.X, x_next))
|
||||||
self.Y = np.vstack((self.Y, reward))
|
self.Y = np.vstack((self.Y, reward))
|
||||||
@ -108,28 +152,58 @@ class BayesianOptimization:
|
|||||||
self.best_reward = np.vstack((self.best_reward, max(self.Y)))
|
self.best_reward = np.vstack((self.best_reward, max(self.Y)))
|
||||||
|
|
||||||
self.episode += 1
|
self.episode += 1
|
||||||
self.policy_model.plot_policy()
|
return step_count
|
||||||
|
|
||||||
def plot_reward(self):
|
def plot_reward(self):
|
||||||
epsiodes = np.linspace(0, self.episode, self.episode)
|
epsiodes = np.linspace(0, self.episode, self.episode)
|
||||||
plt.plot(epsiodes, self.best_reward)
|
plt.plot(epsiodes, self.best_reward)
|
||||||
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
def plot_objective_function(self):
|
||||||
|
plt.scatter(self.X[:, 0], self.Y)
|
||||||
|
|
||||||
|
x_dom = np.linspace(self.lowerb * np.ones((1, 6)), self.upperb * np.ones((1, 6)), 100).reshape(-1, self.nr_policy_weights)
|
||||||
|
y_plot, y_sigma = self.gp.predict(x_dom, return_std=True)
|
||||||
|
|
||||||
|
y_plot = y_plot.reshape(-1)
|
||||||
|
y_sigma = y_sigma.reshape(-1)
|
||||||
|
print(y_plot.shape, x_dom.shape)
|
||||||
|
|
||||||
|
plt.plot(x_dom[:, 0], y_plot)
|
||||||
|
plt.fill_between(
|
||||||
|
x_dom[:, 0],
|
||||||
|
y_plot - 1.96 * y_sigma,
|
||||||
|
y_plot + 1.96 * y_sigma,
|
||||||
|
alpha=0.5
|
||||||
|
)
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
def get_best_result(self):
|
||||||
|
y_hat = self.gp.predict(self.X)
|
||||||
|
idx = np.argmax(y_hat)
|
||||||
|
x_max = self.X[idx, :]
|
||||||
|
self.policy_model.weights = x_max
|
||||||
|
self.policy_model.policy_rollout()
|
||||||
|
print(self.counter_array[idx], idx)
|
||||||
|
self.policy_model.plot_policy(finished=self.counter_array[idx])
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
nr_steps = 100
|
nr_steps = 100
|
||||||
env = Continuous_MountainCarEnv(render_mode='human')
|
env = Continuous_MountainCarEnv() # render_mode='human'
|
||||||
bo = BayesianOptimization(env, nr_steps)
|
bo = BayesianOptimization(env, nr_steps, acq='cb')
|
||||||
bo.initialize()
|
bo.initialize()
|
||||||
iteration_steps = 200
|
iteration_steps = 200
|
||||||
for i in range(iteration_steps):
|
for i in range(iteration_steps):
|
||||||
x_next = bo.next_observation()
|
x_next = bo.next_observation()
|
||||||
bo.eval_new_observation(x_next)
|
step_count = bo.eval_new_observation(x_next)
|
||||||
|
|
||||||
print(bo.episode, bo.best_reward[-1][0])
|
print(bo.episode, bo.best_reward[-1][0], step_count)
|
||||||
|
|
||||||
bo.plot_reward()
|
bo.plot_reward()
|
||||||
bo.policy_model.plot_policy()
|
bo.get_best_result()
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
@ -35,15 +35,15 @@ class GaussianPolicy:
|
|||||||
|
|
||||||
return self.trajectory
|
return self.trajectory
|
||||||
|
|
||||||
def plot_policy(self):
|
def plot_policy(self, finished=np.NAN):
|
||||||
x = np.linspace(0, self.nr_steps, self.nr_steps)
|
x = np.linspace(0, self.nr_steps, self.nr_steps)
|
||||||
plt.plot(x, self.trajectory)
|
plt.plot(x, self.trajectory)
|
||||||
|
if finished != np.NAN:
|
||||||
|
plt.vlines(finished, -1, 1, colors='red')
|
||||||
# for i in self.mean:
|
# for i in self.mean:
|
||||||
# gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
|
# gaussian = np.exp(-0.5 * (x - i)**2 / self.std**2)
|
||||||
# plt.plot(x, gaussian)
|
# plt.plot(x, gaussian)
|
||||||
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
policy = GaussianPolicy(1, 50)
|
policy = GaussianPolicy(1, 50)
|
||||||
|
Loading…
Reference in New Issue
Block a user