ActiveBOToytask/runner/BOGymRunner.py

109 lines
2.8 KiB
Python
Raw Normal View History

2023-02-15 15:03:03 +00:00
from BayesianOptimization.BOwithGym import BayesianOptimization
2023-04-27 14:38:24 +00:00
2023-02-15 15:03:03 +00:00
import numpy as np
import matplotlib.pyplot as plt
2023-04-27 14:38:24 +00:00
# from ToyTask.MountainCarGym import Continuous_MountainCarEnv
from ToyTask.Pendulum import PendulumEnv
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
2023-02-15 15:03:03 +00:00
# BO parameters
env = PendulumEnv()  # toy task the policy is optimized on

nr_steps = 100          # length of one policy trajectory (points per policy)
acquisition_fun = 'ei'  # acquisition function passed to BayesianOptimization ('ei' = expected improvement)
iteration_steps = 100   # BO iterations (new observations) per run

nr_runs = 100           # independent repetitions, aggregated in post_processing

# storage arrays (one column per run)
finished_store = np.zeros((1, nr_runs))            # step at which each run finished / best result per run
best_policy = np.zeros((nr_steps, nr_runs))        # best trajectory found in each run
reward_store = np.zeros((iteration_steps, nr_runs))  # best reward after each BO iteration, per run
2023-02-15 15:03:03 +00:00
# post-processing
def post_processing(finished, policy, reward):
    """Summarize per-run results as mean/std statistics.

    Parameters
    ----------
    finished : array, shape (1, nr_runs)
        Finish metric per run; NaNs are ignored via nanmean/nanstd.
    policy : array, shape (nr_steps, nr_runs)
        Best trajectory per run; reduced across runs (axis=1).
    reward : array, shape (iteration_steps, nr_runs)
        Best-reward curve per run; reduced across runs (axis=1).

    Returns
    -------
    tuple
        (finish_mean, finish_std, policy_mean, policy_std,
         reward_mean, reward_std)
    """
    f_mu, f_sigma = np.nanmean(finished), np.nanstd(finished)
    p_mu, p_sigma = np.mean(policy, axis=1), np.std(policy, axis=1)
    r_mu, r_sigma = np.mean(reward, axis=1), np.std(reward, axis=1)
    return f_mu, f_sigma, p_mu, p_sigma, r_mu, r_sigma
2023-04-27 14:38:24 +00:00
2023-02-15 15:03:03 +00:00
# plot functions
def plot_policy(mean, std, fin_mean, fin_std):
    """Plot the mean policy with a 95% band, plus a vertical marker
    (and its 95% band) at the mean finishing position.

    Displays the figure via ``plt.show()``; returns nothing.
    """
    n = mean.shape[0]
    xs = np.linspace(0, n, n)
    plt.plot(xs, mean)
    # 1.96 sigma ~ 95% confidence band around the mean policy
    plt.fill_between(xs, mean - 1.96 * std, mean + 1.96 * std, alpha=0.5)
    ys = np.linspace(-2, 2, 50)
    plt.vlines(fin_mean, -2, 2, colors='red')
    # horizontal uncertainty band around the mean finishing position
    plt.fill_betweenx(ys, fin_mean - 1.96 * fin_std,
                      fin_mean + 1.96 * fin_std, alpha=0.5)
    plt.show()
2023-04-27 14:38:24 +00:00
2023-02-15 15:03:03 +00:00
def plot_reward(mean, std):
    """Plot the mean best-reward curve with a 95% confidence band.

    Displays the figure via ``plt.show()``; returns nothing.
    """
    n = mean.shape[0]
    episodes = np.linspace(0, n, n)
    plt.plot(episodes, mean)
    # 1.96 sigma ~ 95% confidence band
    plt.fill_between(episodes,
                     mean - 1.96 * std,
                     mean + 1.96 * std,
                     alpha=0.5)
    plt.show()
2023-04-27 14:38:24 +00:00
2023-02-15 15:03:03 +00:00
# main
def main():
    """Run ``nr_runs`` independent BO experiments on the pendulum task,
    collect per-run results into the module-level storage arrays, and
    plot the aggregated policy and reward statistics.
    """
    global finished_store, best_policy, reward_store
    bo = BayesianOptimization(env, nr_steps, acq=acquisition_fun)
    for run in range(nr_runs):
        print('Iteration:', str(run))
        # fresh random environment seed for every run
        bo.env_seed = int(np.random.randint(1, 2147483647, 1)[0])
        bo.initialize()
        for _ in range(iteration_steps):
            candidate = bo.next_observation()
            bo.eval_new_observation(candidate)
        finished = bo.get_best_result(plotter=False)
        # record this run's outcome in column `run`
        finished_store[:, run] = finished
        best_policy[:, run] = bo.policy_model.trajectory.T
        reward_store[:, run] = bo.best_reward.T
        print(reward_store[-1, run])
    f_mu, f_sigma, p_mu, p_sigma, r_mu, r_sigma = post_processing(
        finished_store, best_policy, reward_store)
    plot_policy(p_mu, p_sigma, f_mu, f_sigma)
    plot_reward(r_mu, r_sigma)
2023-04-27 14:38:24 +00:00
2023-02-15 15:03:03 +00:00
# Script entry point: run the full BO experiment when executed directly.
if __name__ == '__main__':
    main()