From 1f4f8787839b314819b1ece4e97a93862b4123c3 Mon Sep 17 00:00:00 2001
From: "nikolaus.feith"
Date: Fri, 3 Feb 2023 14:05:03 +0100
Subject: [PATCH] MountainCarGym.py added

---
 .idea/ActiveBOToytask.iml         |   4 +
 BayesianOptimization/BOwithGym.py |  12 +-
 ToyTask/MountainCarGym.py         | 301 ++++++++++++++++++++++++++++++
 3 files changed, 312 insertions(+), 5 deletions(-)
 create mode 100644 ToyTask/MountainCarGym.py

diff --git a/.idea/ActiveBOToytask.iml b/.idea/ActiveBOToytask.iml
index 22a7bc1..fa79220 100644
--- a/.idea/ActiveBOToytask.iml
+++ b/.idea/ActiveBOToytask.iml
@@ -7,4 +7,8 @@
+
+
\ No newline at end of file
diff --git a/BayesianOptimization/BOwithGym.py b/BayesianOptimization/BOwithGym.py
index 103f544..e114b70 100644
--- a/BayesianOptimization/BOwithGym.py
+++ b/BayesianOptimization/BOwithGym.py
@@ -5,13 +5,15 @@ from sklearn.gaussian_process.kernels import Matern
 from PolicyModel.GaussianModel import GaussianPolicy
 from AcquistionFunctions.ExpectedImprovement import ExpectedImprovement
 
+from ToyTask.MountainCarGym import Continuous_MountainCarEnv
+
 import gym
 import time
 import matplotlib.pyplot as plt
 
 
 class BayesianOptimization:
-    def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=8, policy_seed=None):
+    def __init__(self, env, nr_step, nr_init=3, acq='ei', nr_weights=6, policy_seed=None):
         self.env = env
         self.nr_init = nr_init
         self.acq = acq
@@ -21,7 +23,7 @@ class BayesianOptimization:
         self.episode = 0
         self.best_reward = np.empty((1, 1))
 
-        self.distance_penalty = 10
+        self.distance_penalty = 100
 
         self.nr_policy_weights = nr_weights
         self.nr_steps = nr_step
@@ -115,11 +117,11 @@ class BayesianOptimization:
 
 def main():
-    nr_steps = 80
-    env = gym.envs.make('MountainCarContinuous-v0', render_mode="human")
+    nr_steps = 100
+    env = Continuous_MountainCarEnv(render_mode='human')
     bo = BayesianOptimization(env, nr_steps)
     bo.initialize()
 
-    iteration_steps = 100
+    iteration_steps = 200
     for i in range(iteration_steps):
         x_next = bo.next_observation()
         bo.eval_new_observation(x_next)
diff --git a/ToyTask/MountainCarGym.py b/ToyTask/MountainCarGym.py
new file mode 100644
index 0000000..7da049b
--- /dev/null
+++ b/ToyTask/MountainCarGym.py
@@ -0,0 +1,301 @@
+"""
+@author: Olivier Sigaud
+
+A merge between two sources:
+
+* Adaptation of the MountainCar Environment from the "FAReinforcement" library
+of Jose Antonio Martin H. (version 1.0), adapted by 'Tom Schaul, tom@idsia.ch'
+and then modified by Arnaud de Broissia
+
+* the gym MountainCar environment
+itself from
+http://incompleteideas.net/sutton/MountainCar/MountainCar1.cp
+permalink: https://perma.cc/6Z2N-PFWC
+"""
+
+import math
+from typing import Optional
+
+import numpy as np
+
+import gym
+from gym import spaces
+from gym.envs.classic_control import utils
+from gym.error import DependencyNotInstalled
+
+
+class Continuous_MountainCarEnv(gym.Env):
+    """
+    ### Description
+
+    The Mountain Car MDP is a deterministic MDP that consists of a car placed stochastically
+    at the bottom of a sinusoidal valley, with the only possible actions being the accelerations
+    that can be applied to the car in either direction. The goal of the MDP is to strategically
+    accelerate the car to reach the goal state on top of the right hill. There are two versions
+    of the mountain car domain in gym: one with discrete actions and one with continuous.
+    This version is the one with continuous actions.
+
+    This MDP first appeared in [Andrew Moore's PhD Thesis (1990)](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-209.pdf)
+
+    ```
+    @TECHREPORT{Moore90efficientmemory-based,
+        author = {Andrew William Moore},
+        title = {Efficient Memory-based Learning for Robot Control},
+        institution = {University of Cambridge},
+        year = {1990}
+    }
+    ```
+
+    ### Observation Space
+
+    The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:
+
+    | Num | Observation                          | Min  | Max | Unit         |
+    |-----|--------------------------------------|------|-----|--------------|
+    | 0   | position of the car along the x-axis | -Inf | Inf | position (m) |
+    | 1   | velocity of the car                  | -Inf | Inf | velocity (v) |
+
+    ### Action Space
+
+    The action is a `ndarray` with shape `(1,)`, representing the directional force applied on the car.
+    The action is clipped in the range `[-1,1]` and multiplied by a power of 0.0015.
+
+    ### Transition Dynamics:
+
+    Given an action, the mountain car follows the following transition dynamics:
+
+    *velocity_{t+1} = velocity_t + force * self.power - 0.0025 * cos(3 * position_t)*
+
+    *position_{t+1} = position_t + velocity_{t+1}*
+
+    where force is the action clipped to the range `[-1,1]` and power is a constant 0.0015.
+    The collisions at either end are inelastic with the velocity set to 0 upon collision with the wall.
+    The position is clipped to the range [-1.2, 0.6] and velocity is clipped to the range [-0.07, 0.07].
+
+    ### Reward
+
+    A negative reward of *-1 - 0.1 * action^2* is received at each timestep, penalising elapsed time
+    and actions of large magnitude. If the mountain car reaches the goal, a positive reward of +10
+    is added to the negative reward for that timestep.
+
+    ### Starting State
+
+    The position of the car is assigned a uniform random value in `[-0.6, -0.4]`.
+    The starting velocity of the car is always assigned to 0.
+
+    ### Episode End
+
+    The episode ends if either of the following happens:
+    1. Termination: The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
+    2. Truncation: The length of the episode is 999.
+
+    ### Arguments
+
+    ```
+    gym.make('MountainCarContinuous-v0')
+    ```
+
+    ### Version History
+
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 30,
+    }
+
+    def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
+        self.min_action = -1.0
+        self.max_action = 1.0
+        self.min_position = -1.2
+        self.max_position = 0.6
+        self.max_speed = 0.07
+        self.goal_position = (
+            0.45  # was 0.5 in gym, 0.45 in Arnaud de Broissia's version
+        )
+        self.goal_velocity = goal_velocity
+        self.power = 0.0015
+
+        self.low_state = np.array(
+            [self.min_position, -self.max_speed], dtype=np.float32
+        )
+        self.high_state = np.array(
+            [self.max_position, self.max_speed], dtype=np.float32
+        )
+
+        self.render_mode = render_mode
+
+        self.screen_width = 600
+        self.screen_height = 400
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+
+        self.action_space = spaces.Box(
+            low=self.min_action, high=self.max_action, shape=(1,), dtype=np.float32
+        )
+        self.observation_space = spaces.Box(
+            low=self.low_state, high=self.high_state, dtype=np.float32
+        )
+
+    def step(self, action: np.ndarray):
+
+        position = self.state[0]
+        velocity = self.state[1]
+        force = min(max(action[0], self.min_action), self.max_action)
+
+        velocity += force * self.power - 0.0025 * math.cos(3 * position)
+        if velocity > self.max_speed:
+            velocity = self.max_speed
+        if velocity < -self.max_speed:
+            velocity = -self.max_speed
+        position += velocity
+        if position > self.max_position:
+            position = self.max_position
+        if position < self.min_position:
+            position = self.min_position
+        if position == self.min_position and velocity < 0:
+            velocity = 0
+
+        # Convert a possible numpy bool to a Python bool.
+        terminated = bool(
+            position >= self.goal_position and velocity >= self.goal_velocity
+        )
+
+        reward = 0
+        if terminated:
+            reward += 10
+        reward -= math.pow(action[0], 2) * 0.1
+        reward -= 1
+
+        self.state = np.array([position, velocity], dtype=np.float32)
+
+        if self.render_mode == "human":
+            self.render()
+        return self.state, reward, terminated, False, {}
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(options, -0.6, -0.4)
+        self.state = np.array([self.np_random.uniform(low=low, high=high), 0])
+
+        if self.render_mode == "human":
+            self.render()
+        return np.array(self.state, dtype=np.float32), {}
+
+    def _height(self, xs):
+        return np.sin(3 * xs) * 0.45 + 0.55
+
+    def render(self):
+        if self.render_mode is None:
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
gym("{self.spec.id}", render_mode="rgb_array")' + ) + return + + try: + import pygame + from pygame import gfxdraw + except ImportError: + raise DependencyNotInstalled( + "pygame is not installed, run `pip install gym[classic_control]`" + ) + + if self.screen is None: + pygame.init() + if self.render_mode == "human": + pygame.display.init() + self.screen = pygame.display.set_mode( + (self.screen_width, self.screen_height) + ) + else: # mode == "rgb_array": + self.screen = pygame.Surface((self.screen_width, self.screen_height)) + if self.clock is None: + self.clock = pygame.time.Clock() + + world_width = self.max_position - self.min_position + scale = self.screen_width / world_width + carwidth = 40 + carheight = 20 + + self.surf = pygame.Surface((self.screen_width, self.screen_height)) + self.surf.fill((255, 255, 255)) + + pos = self.state[0] + + xs = np.linspace(self.min_position, self.max_position, 100) + ys = self._height(xs) + xys = list(zip((xs - self.min_position) * scale, ys * scale)) + + pygame.draw.aalines(self.surf, points=xys, closed=False, color=(0, 0, 0)) + + clearance = 10 + + l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0 + coords = [] + for c in [(l, b), (l, t), (r, t), (r, b)]: + c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos)) + coords.append( + ( + c[0] + (pos - self.min_position) * scale, + c[1] + clearance + self._height(pos) * scale, + ) + ) + + gfxdraw.aapolygon(self.surf, coords, (0, 0, 0)) + gfxdraw.filled_polygon(self.surf, coords, (0, 0, 0)) + + for c in [(carwidth / 4, 0), (-carwidth / 4, 0)]: + c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos)) + wheel = ( + int(c[0] + (pos - self.min_position) * scale), + int(c[1] + clearance + self._height(pos) * scale), + ) + + gfxdraw.aacircle( + self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128) + ) + gfxdraw.filled_circle( + self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128) + ) + + flagx = int((self.goal_position - self.min_position) * scale) + flagy1 = int(self._height(self.goal_position) * scale) + flagy2 = flagy1 + 50 + gfxdraw.vline(self.surf, flagx, flagy1, flagy2, (0, 0, 0)) + + gfxdraw.aapolygon( + self.surf, + [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)], + (204, 204, 0), + ) + gfxdraw.filled_polygon( + self.surf, + [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)], + (204, 204, 0), + ) + + self.surf = pygame.transform.flip(self.surf, False, True) + self.screen.blit(self.surf, (0, 0)) + if self.render_mode == "human": + pygame.event.pump() + self.clock.tick(self.metadata["render_fps"]) + pygame.display.flip() + + elif self.render_mode == "rgb_array": + return np.transpose( + np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2) + ) + + def close(self): + if self.screen is not None: + import pygame + + pygame.display.quit() + pygame.quit() + self.isopen = False \ No newline at end of file