added more gym envs
parent 4fe3973a53
commit 69ae81d82d
@@ -8,6 +8,7 @@ from AcquistionFunctions.ProbabilityOfImprovement import ProbabilityOfImprovemen
 from AcquistionFunctions.ConfidenceBound import ConfidenceBound
 from ToyTask.MountainCarGym import Continuous_MountainCarEnv
+from ToyTask.Pendulum import PendulumEnv

 import time

@@ -15,8 +16,9 @@ import matplotlib.pyplot as plt


 class BayesianOptimization:
-    def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
+    def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None, env_seed=None):
         self.env = env
+        self.env_seed = env_seed
         self.nr_init = nr_init
         self.acq = acq
         self.X = None
@@ -48,7 +50,7 @@ class BayesianOptimization:
         self.best_reward = np.empty((1, 1))

     def initialize(self):
-        self.env.reset()
+        self.env.reset(seed=self.env_seed)
         self.reset_bo()
         if self.env.render_mode == 'human':
             self.env.render()
@@ -69,12 +71,14 @@ class BayesianOptimization:
         self.gp.fit(self.X, self.Y)

     def runner(self, policy):
+        self.env.reset(seed=self.env_seed)
         done = False
         step_count = 0
         env_reward = 0.0
         while not done:
             action = policy[step_count]
-            output = self.env.step(action)
+            action_clipped = action.clip(min=-1.0, max=1.0)
+            output = self.env.step(action_clipped.astype(np.float32))
             env_reward += output[1]
             done = output[2]
             if self.env.render_mode == 'human':
@@ -82,8 +86,6 @@ class BayesianOptimization:
             step_count += 1
             if step_count >= self.nr_steps:
                 done = True
-                distance = -(self.env.goal_position - output[0][0])
-                env_reward += distance * self.distance_penalty

         if self.counter_array[0] == 0:

@@ -98,7 +100,6 @@ class BayesianOptimization:

         if self.env.render_mode == 'human':
             time.sleep(0.25)
-        self.env.reset()
         return env_reward, step_count

     def next_observation(self):
@@ -187,22 +188,21 @@ class BayesianOptimization:
     def get_best_result(self, plotter=True):
         y_hat = self.gp.predict(self.X)
         idx = np.argmax(y_hat)
-        print(idx, np.argmax(self.Y))
         x_max = self.X[idx, :]
         self.policy_model.weights = x_max
         self.policy_model.policy_rollout()
         if plotter:
-            print(self.counter_array[idx], idx)
             self.policy_model.plot_policy(finished=self.counter_array[idx])
         else:
             return self.counter_array[idx]


 def main():
     nr_steps = 100
-    env = Continuous_MountainCarEnv() # render_mode='human'
-    bo = BayesianOptimization(env, nr_steps, nr_weights=10, acq='ei')
+    env = PendulumEnv(render_mode='human') # render_mode='human'
+    bo = BayesianOptimization(env, nr_steps, nr_weights=10, acq='ei', env_seed=1234)
     bo.initialize()
-    iteration_steps = 200
+    iteration_steps = 100
     for i in range(iteration_steps):
         x_next = bo.next_observation()
         step_count = bo.eval_new_observation(x_next)
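For orientation, here is a condensed sketch (not part of the commit) of what the reworked `runner` now does per rollout: reseed the environment with `env_seed`, clip each policy action into the `Box(-1, 1)` action space, cast it to `float32`, and accumulate reward until the step limit is reached. The standalone function name `rollout` and the 5-tuple unpacking are illustrative assumptions, not code from the repository.

```
import numpy as np

def rollout(env, policy, nr_steps, env_seed=None):
    # Sketch of the rollout pattern in the updated runner(); names are illustrative.
    env.reset(seed=env_seed)                      # same seed -> comparable rollouts
    env_reward, step_count, done = 0.0, 0, False
    while not done:
        action = np.clip(policy[step_count], -1.0, 1.0)    # stay inside Box(-1, 1)
        obs, reward, terminated, truncated, info = env.step(action.astype(np.float32))
        env_reward += reward
        step_count += 1
        done = terminated or step_count >= nr_steps
    return env_reward, step_count
```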
ToyTask/Cartpole.py (new file, 309 lines)
"""
Classic cart-pole system implemented by Rich Sutton et al.
Copied from http://incompleteideas.net/sutton/book/code/pole.c
permalink: https://perma.cc/C9ZM-652R
"""
import math
from typing import Optional, Union

import numpy as np

import gym
from gym import logger, spaces
from gym.spaces import Box
from gym.envs.classic_control import utils
from gym.error import DependencyNotInstalled


class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
    """
    ### Description

    This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson in
    ["Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problem"](https://ieeexplore.ieee.org/document/6313077).
    A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
    The pendulum is placed upright on the cart and the goal is to balance the pole by applying forces
    in the left and right direction on the cart.

    ### Action Space

    Due to the policy shaping approach the action is a `ndarray` with shape `(1,)` which can take values in `[-1, 1]`.
    The action is scaled by `force_mag` and pushes the cart to the left if it is lower than 0, to the right if it is
    higher than 0, and does nothing if it is equal to 0.


    ### Observation Space

    The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

    | Num | Observation           | Min                 | Max               |
    |-----|-----------------------|---------------------|-------------------|
    | 0   | Cart Position         | -4.8                | 4.8               |
    | 1   | Cart Velocity         | -Inf                | Inf               |
    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
    | 3   | Pole Angular Velocity | -Inf                | Inf               |

    **Note:** While the ranges above denote the possible values for observation space of each element,
    it is not reflective of the allowed values of the state space in an unterminated episode. Particularly:
    - The cart x-position (index 0) can take values between `(-4.8, 4.8)`, but the episode terminates
    if the cart leaves the `(-2.4, 2.4)` range.
    - The pole angle can be observed between `(-.418, .418)` radians (or **±24°**), but the episode terminates
    if the pole angle is not in the range `(-.2095, .2095)` (or **±12°**)

    ### Rewards

    Since the goal is to keep the pole upright for as long as possible, a reward of `+1` for every step taken,
    including the termination step, is allotted. The threshold for rewards is 475 for v1.

    ### Starting State

    All observations are assigned a uniformly random value in `(-0.05, 0.05)`

    ### Episode End

    The episode ends if any one of the following occurs:

    1. Termination: Pole Angle is greater than ±12°
    2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
    3. Truncation: Episode length is greater than 500 (200 for v0)

    ### Arguments

    ```
    gym.make('CartPole-v1')
    ```

    No additional arguments are currently supported.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 50,
    }

    def __init__(self, render_mode: Optional[str] = None):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self.kinematics_integrator = "euler"

        # Angle at which to fail the episode
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        # Angle limit set to 2 * theta_threshold_radians so failing observation
        # is still within bounds.
        high = np.array(
            [
                self.x_threshold * 2,
                np.finfo(np.float32).max,
                self.theta_threshold_radians * 2,
                np.finfo(np.float32).max,
            ],
            dtype=np.float32,
        )

        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)

        self.render_mode = render_mode

        self.screen_width = 600
        self.screen_height = 400
        self.screen = None
        self.clock = None
        self.isopen = True
        self.state = None

        self.steps_beyond_terminated = None

    def step(self, action):
        err_msg = f"{action!r} ({type(action)}) invalid"
        assert self.action_space.contains(action), err_msg
        assert self.state is not None, "Call reset before using step method."
        x, x_dot, theta, theta_dot = self.state
        # changed usage of action due to policy shaping approach
        force = action * self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)

        # For the interested reader:
        # https://coneural.org/florian/papers/05_cart_pole.pdf
        temp = (
            force + self.polemass_length * theta_dot**2 * sintheta
        ) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length * (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)
        )
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

        if self.kinematics_integrator == "euler":
            x = x + self.tau * x_dot
            x_dot = x_dot + self.tau * xacc
            theta = theta + self.tau * theta_dot
            theta_dot = theta_dot + self.tau * thetaacc
        else:  # semi-implicit euler
            x_dot = x_dot + self.tau * xacc
            x = x + self.tau * x_dot
            theta_dot = theta_dot + self.tau * thetaacc
            theta = theta + self.tau * theta_dot

        self.state = (x, x_dot[0], theta, theta_dot[0])

        terminated = bool(
            x < -self.x_threshold
            or x > self.x_threshold
            or theta < -self.theta_threshold_radians
            or theta > self.theta_threshold_radians
        )

        if not terminated:
            reward = 1.0
        elif self.steps_beyond_terminated is None:
            # Pole just fell!
            self.steps_beyond_terminated = 0
            reward = 1.0
        else:
            if self.steps_beyond_terminated == 0:
                logger.warn(
                    "You are calling 'step()' even though this "
                    "environment has already returned terminated = True. You "
                    "should always call 'reset()' once you receive 'terminated = "
                    "True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_terminated += 1
            reward = 0.0

        if self.render_mode == "human":
            self.render()

        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ):
        super().reset(seed=seed)
        # Note that if you use custom reset bounds, it may lead to out-of-bound
        # state/observations.
        low, high = utils.maybe_parse_reset_bounds(
            options, -0.05, 0.05  # default low
        )  # default high
        self.state = self.np_random.uniform(low=low, high=high, size=(4,))
        self.steps_beyond_terminated = None

        if self.render_mode == "human":
            self.render()
        return np.array(self.state, dtype=np.float32), {}

    def render(self):
        if self.render_mode is None:
            gym.logger.warn(
                "You are calling render method without specifying any render mode. "
                "You can specify the render_mode at initialization, "
                f'e.g. gym("{self.spec.id}", render_mode="rgb_array")'
            )
            return

        try:
            import pygame
            from pygame import gfxdraw
        except ImportError:
            raise DependencyNotInstalled(
                "pygame is not installed, run `pip install gym[classic_control]`"
            )

        if self.screen is None:
            pygame.init()
            if self.render_mode == "human":
                pygame.display.init()
                self.screen = pygame.display.set_mode(
                    (self.screen_width, self.screen_height)
                )
            else:  # mode == "rgb_array"
                self.screen = pygame.Surface((self.screen_width, self.screen_height))
        if self.clock is None:
            self.clock = pygame.time.Clock()

        world_width = self.x_threshold * 2
        scale = self.screen_width / world_width
        polewidth = 10.0
        polelen = scale * (2 * self.length)
        cartwidth = 50.0
        cartheight = 30.0

        if self.state is None:
            return None

        x = self.state

        self.surf = pygame.Surface((self.screen_width, self.screen_height))
        self.surf.fill((255, 255, 255))

        l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
        axleoffset = cartheight / 4.0
        cartx = x[0] * scale + self.screen_width / 2.0  # MIDDLE OF CART
        carty = 100  # TOP OF CART
        cart_coords = [(l, b), (l, t), (r, t), (r, b)]
        cart_coords = [(c[0] + cartx, c[1] + carty) for c in cart_coords]
        gfxdraw.aapolygon(self.surf, cart_coords, (0, 0, 0))
        gfxdraw.filled_polygon(self.surf, cart_coords, (0, 0, 0))

        l, r, t, b = (
            -polewidth / 2,
            polewidth / 2,
            polelen - polewidth / 2,
            -polewidth / 2,
        )

        pole_coords = []
        for coord in [(l, b), (l, t), (r, t), (r, b)]:
            coord = pygame.math.Vector2(coord).rotate_rad(-x[2])
            coord = (coord[0] + cartx, coord[1] + carty + axleoffset)
            pole_coords.append(coord)
        gfxdraw.aapolygon(self.surf, pole_coords, (202, 152, 101))
        gfxdraw.filled_polygon(self.surf, pole_coords, (202, 152, 101))

        gfxdraw.aacircle(
            self.surf,
            int(cartx),
            int(carty + axleoffset),
            int(polewidth / 2),
            (129, 132, 203),
        )
        gfxdraw.filled_circle(
            self.surf,
            int(cartx),
            int(carty + axleoffset),
            int(polewidth / 2),
            (129, 132, 203),
        )

        gfxdraw.hline(self.surf, 0, self.screen_width, carty, (0, 0, 0))

        self.surf = pygame.transform.flip(self.surf, False, True)
        self.screen.blit(self.surf, (0, 0))
        if self.render_mode == "human":
            pygame.event.pump()
            self.clock.tick(self.metadata["render_fps"])
            pygame.display.flip()

        elif self.render_mode == "rgb_array":
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
            )

    def close(self):
        if self.screen is not None:
            import pygame

            pygame.display.quit()
            pygame.quit()
            self.isopen = False
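Unlike the stock gym CartPole, this variant takes a continuous action in `[-1, 1]` that is scaled internally by `force_mag`. A minimal usage sketch under that assumption (the seed and action values are arbitrary, not from the commit):

```
import numpy as np
from ToyTask.Cartpole import CartPoleEnv

env = CartPoleEnv()                         # pass render_mode="human" for a pygame window
obs, info = env.reset(seed=1234)
action = np.array([0.5], dtype=np.float32)  # half of force_mag, pushing the cart to the right
obs, reward, terminated, truncated, info = env.step(action)
env.close()
```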
ToyTask/Pendulum.py (new file, 273 lines)
__credits__ = ["Carlos Luis"]

from os import path
from typing import Optional

import numpy as np

import gym
from gym import spaces
from gym.envs.classic_control import utils
from gym.error import DependencyNotInstalled

DEFAULT_X = np.pi
DEFAULT_Y = 1.0


class PendulumEnv(gym.Env):
    """
    ### Description

    The inverted pendulum swingup problem is based on the classic problem in control theory.
    The system consists of a pendulum attached at one end to a fixed point, and the other end being free.
    The pendulum starts in a random position and the goal is to apply torque on the free end to swing it
    into an upright position, with its center of gravity right above the fixed point.

    The diagram below specifies the coordinate system used for the implementation of the pendulum's
    dynamic equations.

    ![Pendulum Coordinate System](./diagrams/pendulum.png)

    - `x-y`: cartesian coordinates of the pendulum's end in meters.
    - `theta` : angle in radians.
    - `tau`: torque in `N m`. Defined as positive _counter-clockwise_.

    ### Action Space

    The action is a `ndarray` with shape `(1,)` representing the torque applied to free end of the pendulum.

    | Num | Action | Min  | Max |
    |-----|--------|------|-----|
    | 0   | Torque | -1.0 | 1.0 |


    ### Observation Space

    The observation is a `ndarray` with shape `(3,)` representing the x-y coordinates of the pendulum's free
    end and its angular velocity.

    | Num | Observation      | Min  | Max |
    |-----|------------------|------|-----|
    | 0   | x = cos(theta)   | -1.0 | 1.0 |
    | 1   | y = sin(theta)   | -1.0 | 1.0 |
    | 2   | Angular Velocity | -8.0 | 8.0 |

    ### Rewards

    The reward function is defined as:

    *r = -(theta<sup>2</sup> + 0.1 * theta_dt<sup>2</sup> + 0.001 * torque<sup>2</sup>)*

    where `$\theta$` is the pendulum's angle normalized between *[-pi, pi]* (with 0 being in the upright position).
    Based on the above equation, the minimum reward that can be obtained is
    *-(pi<sup>2</sup> + 0.1 * 8<sup>2</sup> + 0.001 * 2<sup>2</sup>) = -16.2736044*,
    while the maximum reward is zero (pendulum is upright with zero velocity and no torque applied).

    ### Starting State

    The starting state is a random angle in *[-pi, pi]* and a random angular velocity in *[-1,1]*.

    ### Episode Truncation

    The episode truncates at 200 time steps.

    ### Arguments

    - `g`: acceleration of gravity measured in *(m s<sup>-2</sup>)* used to calculate the pendulum dynamics.
      The default value is g = 10.0 .

    ```
    gym.make('Pendulum-v1', g=9.81)
    ```

    ### Version History

    * v1: Simplify the math equations, no difference in behavior.
    * v0: Initial versions release (1.0.0)

    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 30,
    }

    def __init__(self, render_mode: Optional[str] = None, g=10.0):
        self.max_speed = 8
        self.max_torque = 2.0
        self.dt = 0.05
        self.g = g
        self.m = 1.0
        self.l = 1.0

        self.render_mode = render_mode

        self.screen_dim = 500
        self.screen = None
        self.clock = None
        self.isopen = True

        high = np.array([1.0, 1.0, self.max_speed], dtype=np.float32)
        # This will throw a warning in tests/envs/test_envs in utils/env_checker.py as the space is not symmetric
        #   or normalised as max_torque == 2 by default. Ignoring the issue here as the default settings are too old
        #   to update to follow the openai gym api
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)

    def step(self, u):
        th, thdot = self.state  # th := theta

        g = self.g
        m = self.m
        l = self.l
        dt = self.dt

        u = 2 * u  # scaling the action to +/- 2 Nm

        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u  # for rendering
        costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2)

        newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l**2) * u) * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
        newth = th + newthdot * dt

        self.state = np.array([newth, newthdot])

        if self.render_mode == "human":
            self.render()
        return self._get_obs(), -costs, False, False, {}

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        if options is None:
            high = np.array([DEFAULT_X, DEFAULT_Y])
        else:
            # Note that if you use custom reset bounds, it may lead to out-of-bound
            # state/observations.
            x = options.get("x_init") if "x_init" in options else DEFAULT_X
            y = options.get("y_init") if "y_init" in options else DEFAULT_Y
            x = utils.verify_number_and_cast(x)
            y = utils.verify_number_and_cast(y)
            high = np.array([x, y])
        low = -high  # We enforce symmetric limits.
        self.state = self.np_random.uniform(low=low, high=high)
        self.last_u = None

        if self.render_mode == "human":
            self.render()
        return self._get_obs(), {}

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32)

    def render(self):
        if self.render_mode is None:
            gym.logger.warn(
                "You are calling render method without specifying any render mode. "
                "You can specify the render_mode at initialization, "
                f'e.g. gym("{self.spec.id}", render_mode="rgb_array")'
            )
            return

        try:
            import pygame
            from pygame import gfxdraw
        except ImportError:
            raise DependencyNotInstalled(
                "pygame is not installed, run `pip install gym[classic_control]`"
            )

        if self.screen is None:
            pygame.init()
            if self.render_mode == "human":
                pygame.display.init()
                self.screen = pygame.display.set_mode(
                    (self.screen_dim, self.screen_dim)
                )
            else:  # mode in "rgb_array"
                self.screen = pygame.Surface((self.screen_dim, self.screen_dim))
        if self.clock is None:
            self.clock = pygame.time.Clock()

        self.surf = pygame.Surface((self.screen_dim, self.screen_dim))
        self.surf.fill((255, 255, 255))

        bound = 2.2
        scale = self.screen_dim / (bound * 2)
        offset = self.screen_dim // 2

        rod_length = 1 * scale
        rod_width = 0.2 * scale
        l, r, t, b = 0, rod_length, rod_width / 2, -rod_width / 2
        coords = [(l, b), (l, t), (r, t), (r, b)]
        transformed_coords = []
        for c in coords:
            c = pygame.math.Vector2(c).rotate_rad(self.state[0] + np.pi / 2)
            c = (c[0] + offset, c[1] + offset)
            transformed_coords.append(c)
        gfxdraw.aapolygon(self.surf, transformed_coords, (204, 77, 77))
        gfxdraw.filled_polygon(self.surf, transformed_coords, (204, 77, 77))

        gfxdraw.aacircle(self.surf, offset, offset, int(rod_width / 2), (204, 77, 77))
        gfxdraw.filled_circle(
            self.surf, offset, offset, int(rod_width / 2), (204, 77, 77)
        )

        rod_end = (rod_length, 0)
        rod_end = pygame.math.Vector2(rod_end).rotate_rad(self.state[0] + np.pi / 2)
        rod_end = (int(rod_end[0] + offset), int(rod_end[1] + offset))
        gfxdraw.aacircle(
            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
        )
        gfxdraw.filled_circle(
            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
        )

        fname = path.join(path.dirname(__file__), "assets/clockwise.png")
        img = pygame.image.load(fname)
        if self.last_u is not None:
            scale_img = pygame.transform.smoothscale(
                img,
                (scale * np.abs(self.last_u) / 2, scale * np.abs(self.last_u) / 2),
            )
            is_flip = bool(self.last_u > 0)
            scale_img = pygame.transform.flip(scale_img, is_flip, True)
            self.surf.blit(
                scale_img,
                (
                    offset - scale_img.get_rect().centerx,
                    offset - scale_img.get_rect().centery,
                ),
            )

        # drawing axle
        gfxdraw.aacircle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))
        gfxdraw.filled_circle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))

        self.surf = pygame.transform.flip(self.surf, False, True)
        self.screen.blit(self.surf, (0, 0))
        if self.render_mode == "human":
            pygame.event.pump()
            self.clock.tick(self.metadata["render_fps"])
            pygame.display.flip()

        else:  # mode == "rgb_array":
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
            )

    def close(self):
        if self.screen is not None:
            import pygame

            pygame.display.quit()
            pygame.quit()
            self.isopen = False


def angle_normalize(x):
    return ((x + np.pi) % (2 * np.pi)) - np.pi
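Note that `step` rescales the incoming action by a factor of 2 (`u = 2 * u`), so a policy that emits values in `[-1, 1]`, as the BO runner above does, commands torques in `[-2, 2]` N m. A minimal usage sketch under that assumption (seed and action values are arbitrary, not from the commit):

```
import numpy as np
from ToyTask.Pendulum import PendulumEnv

env = PendulumEnv()                         # g=10.0 by default
obs, info = env.reset(seed=1234)
action = np.array([1.0], dtype=np.float32)  # interpreted internally as +2.0 N m of torque
obs, reward, terminated, truncated, info = env.step(action)
print(reward)                               # -(theta^2 + 0.1*theta_dot^2 + 0.001*torque^2) <= 0
env.close()
```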
ToyTask/assets/clockwise.png (new binary file, 6.8 KiB; not shown)
@@ -1,11 +1,19 @@
 from BayesianOptimization.BOwithGym import BayesianOptimization
-from ToyTask.MountainCarGym import Continuous_MountainCarEnv
 import numpy as np

 import matplotlib.pyplot as plt

+# from ToyTask.MountainCarGym import Continuous_MountainCarEnv
+from ToyTask.Pendulum import PendulumEnv
+
+import warnings
+from sklearn.exceptions import ConvergenceWarning
+
+warnings.filterwarnings("ignore", category=ConvergenceWarning)
+
 # BO parameters
-env = Continuous_MountainCarEnv()
+env = PendulumEnv()
 nr_steps = 100
 acquisition_fun = 'ei'
 iteration_steps = 100
@@ -17,6 +25,7 @@ finished_store = np.zeros((1, nr_runs))
 best_policy = np.zeros((nr_steps, nr_runs))
 reward_store = np.zeros((iteration_steps, nr_runs))

+
 # post-processing
 def post_processing(finished, policy, reward):

@@ -31,6 +40,7 @@ def post_processing(finished, policy, reward):

     return finish_mean, finish_std, policy_mean, policy_std, reward_mean, reward_std

+
 # plot functions
 def plot_policy(mean, std, fin_mean, fin_std):
     x = np.linspace(0, mean.shape[0], mean.shape[0])
@@ -53,6 +63,7 @@ def plot_policy(mean, std, fin_mean, fin_std):

     plt.show()

+
 def plot_reward(mean, std):
     eps = np.linspace(0, mean.shape[0], mean.shape[0])
     plt.plot(eps, mean)
@@ -65,12 +76,14 @@ def plot_reward(mean, std):
     )
     plt.show()

+
 # main
 def main():
     global finished_store, best_policy, reward_store
     bo = BayesianOptimization(env, nr_steps, acq=acquisition_fun)
     for i in range(nr_runs):
         print('Iteration:', str(i))
+        bo.env_seed = int(np.random.randint(1, 2147483647, 1)[0])
         bo.initialize()
         for j in range(iteration_steps):
             x_next = bo.next_observation()
@@ -82,11 +95,14 @@ def main():
         best_policy[:, i] = bo.policy_model.trajectory.T
         reward_store[:, i] = bo.best_reward.T

+        print(reward_store[-1, i])
+
     finish_mean, finish_std, policy_mean, policy_std, reward_mean, reward_std = post_processing(finished_store,
                                                                                                 best_policy,
                                                                                                 reward_store)
     plot_policy(policy_mean, policy_std, finish_mean, finish_std)
     plot_reward(reward_mean, reward_std)


 if __name__ == '__main__':
     main()
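The last hunk draws a fresh `env_seed` for every run from `np.random.randint(1, 2147483647, 1)`, i.e. a positive 31-bit integer, so each of the `nr_runs` repetitions is evaluated on a differently seeded environment while all rollouts inside one run reuse that seed via `env.reset(seed=self.env_seed)`. A minimal sketch of the same pattern, with `nr_runs` set to an illustrative value:

```
import numpy as np

nr_runs = 10                                                  # illustrative; the script defines its own value
for run in range(nr_runs):
    env_seed = int(np.random.randint(1, 2147483647, 1)[0])    # upper bound is exclusive
    # bo.env_seed = env_seed; bo.initialize(); ... one full BO experiment per seed
    print(f"run {run}: env_seed={env_seed}")
```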