ActiveBOToytask/BayesianOptimization/BOwithGym.py

245 lines
8.3 KiB
Python
Raw Permalink Normal View History

2023-02-02 17:54:09 +00:00
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from PolicyModel.GaussianModel import GaussianPolicy
from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
2023-02-06 14:43:30 +00:00
from AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovement
from AcquistionFunctions.ConfidenceBound import ConfidenceBound
2023-07-05 11:24:03 +00:00
from AcquistionFunctions.ExpectedImprovementwithPreference import expected_improvement
2023-02-02 17:54:09 +00:00
2023-02-03 13:05:03 +00:00
from ToyTask.MountainCarGym import Continuous_MountainCarEnv
2023-04-27 14:38:24 +00:00
from ToyTask.Pendulum import PendulumEnv
2023-02-03 13:05:03 +00:00
2023-02-02 17:54:09 +00:00
import time
import matplotlib.pyplot as plt
2023-02-06 14:43:30 +00:00
2023-02-02 17:54:09 +00:00
class BayesianOptimization:
2023-04-27 14:38:24 +00:00
def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None, env_seed=None):
2023-02-02 17:54:09 +00:00
self.env = env
2023-04-27 14:38:24 +00:00
self.env_seed = env_seed
2023-02-02 17:54:09 +00:00
self.nr_init = nr_init
self.acq = acq
self.X = None
self.Y = None
2023-02-06 14:43:30 +00:00
self.counter_array = np.zeros((1, 1))
2023-02-02 17:54:09 +00:00
self.gp = None
self.episode = 0
self.best_reward = np.empty((1, 1))
2023-02-06 14:43:30 +00:00
self.distance_penalty = 0
2023-02-02 17:54:09 +00:00
self.nr_policy_weights = nr_weights
self.nr_steps = nr_step
self.policy_seed = policy_seed
self.lowerb = -1.0
self.upperb = 1.0
self.policy_model = GaussianPolicy(self.nr_policy_weights,
self.nr_steps,
self.policy_seed,
self.lowerb,
self.upperb)
self.nr_test = 100
2023-07-05 11:24:03 +00:00
self.mean_pref = np.zeros((self.nr_policy_weights, 1))
self.var_pref = np.ones(self.mean_pref.shape) * 0.75
self.use_gaussian = np.array([False] * self.nr_policy_weights)
2023-02-02 17:54:09 +00:00
2023-02-15 15:03:03 +00:00
def reset_bo(self):
self.counter_array = np.zeros((1, 1))
self.gp = None
self.episode = 0
self.best_reward = np.empty((1, 1))
2023-02-02 17:54:09 +00:00
def initialize(self):
2023-04-27 14:38:24 +00:00
self.env.reset(seed=self.env_seed)
2023-02-15 15:03:03 +00:00
self.reset_bo()
2023-02-06 14:43:30 +00:00
if self.env.render_mode == 'human':
self.env.render()
2023-02-02 17:54:09 +00:00
self.X = np.zeros((self.nr_init, self.nr_policy_weights))
self.Y = np.zeros((self.nr_init, 1))
for i in range(self.nr_init):
self.policy_model.random_policy()
self.X[i, :] = self.policy_model.weights.T
policy = self.policy_model.policy_rollout()
2023-02-06 14:43:30 +00:00
reward, step_count = self.runner(policy)
2023-02-02 17:54:09 +00:00
self.Y[i] = reward
self.gp = GaussianProcessRegressor(Matern(nu=1.5))
self.gp.fit(self.X, self.Y)
def runner(self, policy):
2023-04-27 14:38:24 +00:00
self.env.reset(seed=self.env_seed)
2023-02-02 17:54:09 +00:00
done = False
step_count = 0
env_reward = 0.0
while not done:
action = policy[step_count]
2023-04-27 14:38:24 +00:00
action_clipped = action.clip(min=-1.0, max=1.0)
output = self.env.step(action_clipped.astype(np.float32))
2023-02-02 17:54:09 +00:00
env_reward += output[1]
done = output[2]
2023-02-06 14:43:30 +00:00
if self.env.render_mode == 'human':
time.sleep(0.0001)
2023-02-02 17:54:09 +00:00
step_count += 1
if step_count >= self.nr_steps:
done = True
2023-02-06 14:43:30 +00:00
if self.counter_array[0] == 0:
if step_count == 100:
step_count = np.NAN
self.counter_array[0] = step_count
else:
if step_count == 100:
step_count = np.NAN
self.counter_array = np.vstack((self.counter_array, step_count))
if self.env.render_mode == 'human':
time.sleep(0.25)
return env_reward, step_count
2023-02-02 17:54:09 +00:00
def next_observation(self):
if self.acq == 'ei':
x_next = ExpectedImprovement(self.gp,
self.X,
self.nr_test,
self.nr_policy_weights,
2023-02-06 14:43:30 +00:00
kappa=0,
2023-02-02 17:54:09 +00:00
seed=self.policy_seed,
lower=self.lowerb,
upper=self.upperb)
return x_next
2023-02-06 14:43:30 +00:00
elif self.acq == 'pi':
x_next = ProbabilityOfImprovement(self.gp,
self.X,
self.nr_test,
self.nr_policy_weights,
2023-02-13 12:15:08 +00:00
kappa=0,
2023-02-06 14:43:30 +00:00
seed=self.policy_seed,
lower=self.lowerb,
upper=self.upperb)
return x_next
elif self.acq == 'cb':
x_next = ConfidenceBound(self.gp,
self.X,
self.nr_test,
self.nr_policy_weights,
lam=2.576,
seed=self.policy_seed,
lower=self.lowerb,
upper=self.upperb)
return x_next
2023-07-05 11:24:03 +00:00
elif self.acq == 'ei_gauss':
if self.episode >= 25:
self.use_gaussian = np.array([True] * self.nr_policy_weights)
best_5_idx = np.argsort(self.Y, axis=0)[:5].flatten()
self.mean_pref = np.mean(self.X[best_5_idx], axis=0)
self.var_pref *= 0.95
x_next = expected_improvement(self.gp,
self.X,
self.nr_test,
self.use_gaussian,
self.mean_pref,
self.var_pref,
kappa=0,
seed=self.policy_seed,
lower=self.lowerb,
upper=self.upperb)
return x_next
2023-02-02 17:54:09 +00:00
else:
raise NotImplementedError
def eval_new_observation(self, x_next):
self.policy_model.weights = x_next
policy = self.policy_model.policy_rollout()
2023-02-06 14:43:30 +00:00
reward, step_count = self.runner(policy)
2023-02-02 17:54:09 +00:00
self.X = np.vstack((self.X, x_next))
self.Y = np.vstack((self.Y, reward))
self.gp.fit(self.X, self.Y)
if self.episode == 0:
self.best_reward[0] = max(self.Y)
else:
self.best_reward = np.vstack((self.best_reward, max(self.Y)))
self.episode += 1
2023-02-06 14:43:30 +00:00
return step_count
2023-02-02 17:54:09 +00:00
def plot_reward(self):
epsiodes = np.linspace(0, self.episode, self.episode)
plt.plot(epsiodes, self.best_reward)
2023-02-06 14:43:30 +00:00
2023-02-02 17:54:09 +00:00
plt.show()
2023-02-06 14:43:30 +00:00
def plot_objective_function(self):
plt.scatter(self.X[:, 0], self.Y)
2023-07-05 11:24:03 +00:00
x_dom = np.linspace(self.lowerb * np.ones((1, 6)), self.upperb * np.ones((1, 6)), 100).reshape(-1,
self.nr_policy_weights)
2023-02-06 14:43:30 +00:00
y_plot, y_sigma = self.gp.predict(x_dom, return_std=True)
y_plot = y_plot.reshape(-1)
y_sigma = y_sigma.reshape(-1)
print(y_plot.shape, x_dom.shape)
plt.plot(x_dom[:, 0], y_plot)
plt.fill_between(
x_dom[:, 0],
y_plot - 1.96 * y_sigma,
y_plot + 1.96 * y_sigma,
alpha=0.5
)
plt.show()
2023-02-15 15:03:03 +00:00
def get_best_result(self, plotter=True):
2023-02-06 14:43:30 +00:00
y_hat = self.gp.predict(self.X)
idx = np.argmax(y_hat)
x_max = self.X[idx, :]
self.policy_model.weights = x_max
self.policy_model.policy_rollout()
2023-02-15 15:03:03 +00:00
if plotter:
self.policy_model.plot_policy(finished=self.counter_array[idx])
else:
return self.counter_array[idx]
2023-02-02 17:54:09 +00:00
2023-04-27 14:38:24 +00:00
2023-02-02 17:54:09 +00:00
def main():
2023-02-03 13:05:03 +00:00
nr_steps = 100
2023-07-05 11:24:03 +00:00
env = Continuous_MountainCarEnv(render_mode='human') # render_mode='human'
bo = BayesianOptimization(env, nr_steps, nr_weights=10, acq='ei_gauss', env_seed=1234)
2023-02-02 17:54:09 +00:00
bo.initialize()
2023-04-27 14:38:24 +00:00
iteration_steps = 100
2023-02-02 17:54:09 +00:00
for i in range(iteration_steps):
x_next = bo.next_observation()
2023-02-06 14:43:30 +00:00
step_count = bo.eval_new_observation(x_next)
2023-02-02 17:54:09 +00:00
2023-02-06 14:43:30 +00:00
print(bo.episode, bo.best_reward[-1][0], step_count)
2023-02-02 17:54:09 +00:00
bo.plot_reward()
2023-02-06 14:43:30 +00:00
bo.get_best_result()
plt.show()
2023-02-02 17:54:09 +00:00
if __name__ == "__main__":
main()