2023-02-02 17:54:09 +00:00
|
|
|
import numpy as np
|
|
|
|
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
|
|
from sklearn.gaussian_process.kernels import Matern
|
|
|
|
|
|
|
|
from PolicyModel.GaussianModel import GaussianPolicy
|
|
|
|
from ToyTask.MountainCar import MountainCarEnv
|
|
|
|
from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
from warnings import catch_warnings, simplefilter
|
|
|
|
|
2023-04-19 15:01:16 +00:00
|
|
|
|
2023-02-02 17:54:09 +00:00
|
|
|
class BayesianOptimization:
|
|
|
|
def __init__(self, env, nr_init=3, acq='ei', nr_weights=8, policy_seed=None):
|
|
|
|
self.env = env
|
|
|
|
self.nr_init = nr_init
|
|
|
|
self.acq = acq
|
|
|
|
self.X = None
|
|
|
|
self.Y = None
|
|
|
|
self.gp = None
|
|
|
|
|
|
|
|
self.episode = 0
|
|
|
|
self.best_reward = np.empty((1, 1))
|
|
|
|
|
|
|
|
self.nr_policy_weights = nr_weights
|
|
|
|
self.nr_steps = env.max_step
|
|
|
|
self.policy_seed = policy_seed
|
|
|
|
self.lowerb = -1.0
|
|
|
|
self.upperb = 1.0
|
|
|
|
self.policy_model = GaussianPolicy(self.nr_policy_weights,
|
|
|
|
self.nr_steps,
|
|
|
|
self.policy_seed,
|
|
|
|
self.lowerb,
|
|
|
|
self.upperb)
|
|
|
|
|
|
|
|
self.nr_test = 100
|
|
|
|
|
|
|
|
def initialize(self):
|
|
|
|
self.X = np.zeros((self.nr_init, self.nr_policy_weights))
|
|
|
|
self.Y = np.zeros((self.nr_init, 1))
|
|
|
|
|
|
|
|
for i in range(self.nr_init):
|
|
|
|
self.policy_model.random_policy()
|
|
|
|
self.X[i, :] = self.policy_model.weights.T
|
|
|
|
policy = self.policy_model.policy_rollout()
|
|
|
|
|
|
|
|
reward, step_count = self.env.runner(policy)
|
|
|
|
|
|
|
|
self.Y[i] = reward
|
|
|
|
|
|
|
|
self.gp = GaussianProcessRegressor(Matern(nu=1.5))
|
|
|
|
self.gp.fit(self.X, self.Y)
|
|
|
|
|
|
|
|
def next_observation(self):
|
|
|
|
if self.acq == 'ei':
|
|
|
|
x_next = ExpectedImprovement(self.gp,
|
|
|
|
self.X,
|
|
|
|
self.nr_test,
|
|
|
|
self.nr_policy_weights,
|
|
|
|
seed=self.policy_seed,
|
|
|
|
lower=self.lowerb,
|
|
|
|
upper=self.upperb)
|
|
|
|
|
|
|
|
return x_next
|
|
|
|
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
|
|
|
|
def eval_new_observation(self, x_next):
|
|
|
|
self.policy_model.weights = x_next
|
|
|
|
policy = self.policy_model.policy_rollout()
|
|
|
|
reward, _ = self.env.runner(policy)
|
|
|
|
|
|
|
|
self.X = np.vstack((self.X, x_next))
|
|
|
|
self.Y = np.vstack((self.Y, reward))
|
|
|
|
|
|
|
|
self.gp.fit(self.X, self.Y)
|
|
|
|
|
|
|
|
if self.episode == 0:
|
|
|
|
self.best_reward[0] = max(self.Y)
|
|
|
|
else:
|
|
|
|
self.best_reward = np.vstack((self.best_reward, max(self.Y)))
|
|
|
|
|
|
|
|
self.episode += 1
|
|
|
|
|
|
|
|
def plot(self):
|
|
|
|
if self.nr_policy_weights == 1:
|
|
|
|
plt.scatter(self.X, self.Y)
|
|
|
|
|
|
|
|
x_plot = np.linspace(self.lowerb, self.upperb, 100).reshape(-1, 1)
|
|
|
|
with catch_warnings():
|
|
|
|
simplefilter('ignore')
|
|
|
|
y_plot, y_sigma = self.gp.predict(x_plot, return_std=True)
|
|
|
|
|
|
|
|
plt.plot(x_plot, y_plot)
|
|
|
|
plt.fill_between(
|
|
|
|
x_plot.ravel(),
|
|
|
|
y_plot - 1.96 * y_sigma,
|
|
|
|
y_plot + 1.96 * y_sigma,
|
|
|
|
alpha=0.5
|
|
|
|
)
|
|
|
|
|
|
|
|
plt.show()
|
|
|
|
|
|
|
|
def plot_reward(self):
|
|
|
|
epsiodes = np.linspace(0, self.episode, self.episode)
|
|
|
|
plt.plot(epsiodes, self.best_reward)
|
|
|
|
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
max_step = 50
|
|
|
|
car = MountainCarEnv(max_step)
|
|
|
|
bo = BayesianOptimization(car)
|
|
|
|
bo.initialize()
|
|
|
|
iteration_steps = 100
|
|
|
|
for i in range(iteration_steps):
|
|
|
|
x_next = bo.next_observation()
|
|
|
|
bo.eval_new_observation(x_next)
|
|
|
|
|
|
|
|
bo.plot_reward()
|
|
|
|
bo.policy_model.plot_policy()
|
|
|
|
|
2023-08-17 16:26:23 +00:00
|
|
|
|
2023-02-02 17:54:09 +00:00
|
|
|
if __name__ == "__main__":
|
|
|
|
main()
|