Added BOGymRunner.py
This commit is contained in:
parent
caf76c6f9b
commit
d568523c1a
@ -41,8 +41,15 @@ class BayesianOptimization:
|
|||||||
|
|
||||||
self.nr_test = 100
|
self.nr_test = 100
|
||||||
|
|
||||||
|
def reset_bo(self):
    """Reset the per-run Bayesian-optimisation state before a fresh run.

    Drops the GP surrogate and the episode counter, and re-allocates the
    bookkeeping arrays (finish counters, best rewards) to their initial
    1x1 shape. Called from initialize() at the start of every run.
    """
    # Surrogate model and progress counter go back to their pristine state.
    self.gp = None
    self.episode = 0
    # Bookkeeping arrays are re-created rather than cleared, so any old
    # references held elsewhere are not silently overwritten.
    self.counter_array = np.zeros((1, 1))
    self.best_reward = np.empty((1, 1))
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.env.reset()
|
self.env.reset()
|
||||||
|
self.reset_bo()
|
||||||
if self.env.render_mode == 'human':
|
if self.env.render_mode == 'human':
|
||||||
self.env.render()
|
self.env.render()
|
||||||
|
|
||||||
@ -177,14 +184,17 @@ class BayesianOptimization:
|
|||||||
)
|
)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def get_best_result(self, plotter=True):
    """Roll out the best policy found so far, judged by the GP posterior mean.

    Predicts the reward for every evaluated weight vector in self.X, loads
    the argmax weights into the policy model and performs a rollout.

    :param plotter: when True, print and plot the winning policy (returns
        None); when False, return the finish counter of the winning index.
    """
    posterior_mean = self.gp.predict(self.X)
    best_idx = np.argmax(posterior_mean)
    self.policy_model.weights = self.X[best_idx, :]
    self.policy_model.policy_rollout()

    # Guard clause: non-interactive callers just want the finish counter.
    if not plotter:
        return self.counter_array[best_idx]

    print(self.counter_array[best_idx], best_idx)
    self.policy_model.plot_policy(finished=self.counter_array[best_idx])
|
||||||
def main():
|
def main():
|
||||||
nr_steps = 100
|
nr_steps = 100
|
||||||
|
@ -0,0 +1,92 @@
|
|||||||
|
from BayesianOptimization.BOwithGym import BayesianOptimization
|
||||||
|
from ToyTask.MountainCarGym import Continuous_MountainCarEnv
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# BO parameters for the mountain-car experiment.
env = Continuous_MountainCarEnv()  # toy task under optimisation
nr_steps = 100         # trajectory length per rollout (rows of best_policy)
acquisition_fun = 'ei'  # acquisition key passed to BayesianOptimization — presumably expected improvement; confirm in BOwithGym
iteration_steps = 500  # BO observations per run (rows of reward_store)

nr_runs = 20           # independent repetitions of the whole experiment

# storage arrays — one column per run, filled in place by main()
finished_store = np.zeros((1, nr_runs))
best_policy = np.zeros((nr_steps, nr_runs))
reward_store = np.zeros((iteration_steps, nr_runs))
|
||||||
|
|
||||||
|
# post-processing
|
||||||
|
def post_processing(finished, policy, reward):
    """Aggregate per-run results into mean/std statistics.

    :param finished: (1, nr_runs) finish counters; NaN entries are ignored.
    :param policy: (nr_steps, nr_runs) best trajectories, one column per run.
    :param reward: (iteration_steps, nr_runs) reward curves, one per run.
    :return: (finish_mean, finish_std, policy_mean, policy_std,
              reward_mean, reward_std) — scalars for the finish stats,
              per-step/per-iteration vectors for the rest.
    """
    # Finish counters may contain NaN for runs that never finished,
    # hence the nan-aware reductions.
    finish_stats = (np.nanmean(finished), np.nanstd(finished))
    # Reduce across runs (axis=1), keeping one value per step/iteration.
    policy_stats = (np.mean(policy, axis=1), np.std(policy, axis=1))
    reward_stats = (np.mean(reward, axis=1), np.std(reward, axis=1))
    return (*finish_stats, *policy_stats, *reward_stats)
|
||||||
|
|
||||||
|
# plot functions
|
||||||
|
def plot_policy(mean, std, fin_mean, fin_std):
    """Plot the mean best policy with a 95% band, plus the finish marker.

    Draws the per-step policy mean with a 1.96-sigma fill, then a red
    vertical line at the mean finish step with its own horizontal
    1.96-sigma band. Blocks on plt.show().
    """
    n_steps = mean.shape[0]
    x = np.linspace(0, n_steps, n_steps)
    lower = mean - 1.96 * std
    upper = mean + 1.96 * std
    plt.plot(x, mean)
    plt.fill_between(x, lower, upper, alpha=0.5)

    # Finish-step marker: vertical line spanning the visible action range,
    # with a shaded confidence band around the mean finish step.
    y = np.linspace(-2, 2, 50)
    plt.vlines(fin_mean, -2, 2, colors='red')
    plt.fill_betweenx(
        y,
        fin_mean - 1.96 * fin_std,
        fin_mean + 1.96 * fin_std,
        alpha=0.5,
    )

    plt.show()
|
||||||
|
|
||||||
|
def plot_reward(mean, std):
    """Plot the mean best-reward curve over BO iterations with a 95% band.

    Blocks on plt.show().
    """
    n_eps = mean.shape[0]
    eps = np.linspace(0, n_eps, n_eps)
    lower = mean - 1.96 * std
    upper = mean + 1.96 * std

    plt.plot(eps, mean)
    plt.fill_between(eps, lower, upper, alpha=0.5)
    plt.show()
|
||||||
|
|
||||||
|
# main
|
||||||
|
def main():
    """Run nr_runs independent BO experiments and plot aggregate statistics.

    For each run: initialize the optimizer, take iteration_steps BO
    observations, then record the best result into the module-level
    storage arrays. Afterwards, post-process and plot policy/reward stats.

    Note: the storage arrays are only mutated item-wise (never rebound),
    so no `global` declaration is required.
    """
    bo = BayesianOptimization(env, nr_steps, acq=acquisition_fun)
    for i in range(nr_runs):
        print('Iteration:', i)  # str() was redundant — print formats ints
        bo.initialize()
        # Inner BO loop: propose and evaluate one observation per step.
        for _ in range(iteration_steps):
            x_next = bo.next_observation()
            bo.eval_new_observation(x_next)

        # plotter=False makes get_best_result return the finish counter
        # instead of plotting.
        finished_store[:, i] = bo.get_best_result(plotter=False)
        best_policy[:, i] = bo.policy_model.trajectory.T
        reward_store[:, i] = bo.best_reward.T

    (finish_mean, finish_std,
     policy_mean, policy_std,
     reward_mean, reward_std) = post_processing(finished_store,
                                                best_policy,
                                                reward_store)
    plot_policy(policy_mean, policy_std, finish_mean, finish_std)
    plot_reward(reward_mean, reward_std)
|
||||||
|
|
||||||
|
# Script entry point: run the full experiment suite when executed directly.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user