diff --git a/src/active_bo_msgs/msg/ActiveBORequest.msg b/src/active_bo_msgs/msg/ActiveBORequest.msg index 7e63345..61c61ef 100644 --- a/src/active_bo_msgs/msg/ActiveBORequest.msg +++ b/src/active_bo_msgs/msg/ActiveBORequest.msg @@ -1,6 +1,8 @@ +string env +string metric uint16 nr_weights uint16 max_steps uint16 nr_episodes uint16 nr_runs string acquisition_function -float32 epsilon \ No newline at end of file +float32 metric_parameter \ No newline at end of file diff --git a/src/active_bo_msgs/msg/ActiveRL.msg b/src/active_bo_msgs/msg/ActiveRL.msg index 60dca7b..23b086f 100644 --- a/src/active_bo_msgs/msg/ActiveRL.msg +++ b/src/active_bo_msgs/msg/ActiveRL.msg @@ -1,2 +1,3 @@ +string env float32[] policy float32[] weights \ No newline at end of file diff --git a/src/active_bo_msgs/srv/RLRollOut.srv b/src/active_bo_msgs/srv/RLRollOut.srv index 00555f7..0581f5c 100644 --- a/src/active_bo_msgs/srv/RLRollOut.srv +++ b/src/active_bo_msgs/srv/RLRollOut.srv @@ -1,3 +1,4 @@ +string env float32[] policy --- float32 reward diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py index 4102e17..16534b1 100644 --- a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py +++ b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py @@ -47,9 +47,12 @@ class BayesianOptimization: env_reward = 0.0 step_count = 0 + self.env.reset() + for i in range(len(policy)): action = policy[i] - output = self.env.step(action) + action_clipped = action.clip(min=-1.0, max=1.0) + output = self.env.step(action_clipped.astype(np.float32)) env_reward += output[1] done = output[2] diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py index e69de29..a3d3b6e 100644 --- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py +++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py @@ -0,0 +1,465 @@ +"""classic Acrobot task""" +from typing import Optional + +import numpy as np +from numpy import cos, pi, sin + +from gym import core, logger, spaces +from gym.error import DependencyNotInstalled + +__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" +__credits__ = [ + "Alborz Geramifard", + "Robert H. Klein", + "Christoph Dann", + "William Dabney", + "Jonathan P. How", +] +__license__ = "BSD 3-Clause" +__author__ = "Christoph Dann " + +# SOURCE: +# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py +from gym.envs.classic_control import utils + + +class AcrobotEnv(core.Env): + """ + ### Description + + The Acrobot environment is based on Sutton's work in + ["Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding"](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html) + and [Sutton and Barto's book](http://www.incompleteideas.net/book/the-book-2nd.html). + The system consists of two links connected linearly to form a chain, with one end of + the chain fixed. The joint between the two links is actuated. The goal is to apply + torques on the actuated joint to swing the free end of the linear chain above a + given height while starting from the initial state of hanging downwards. + + As seen in the **Gif**: two blue links connected by two green joints. The joint in + between the two links is actuated. The goal is to swing the free end of the outer-link + to reach the target height (black horizontal line above system) by applying torque on + the actuator. + + ### Action Space + + The action is discrete, deterministic, and represents the torque applied on the actuated + joint between the two links. + + | Num | Action | Unit | + |-----|---------------------------------------|--------------| + | 0 | apply -1 torque to the actuated joint | torque (N m) | + | 1 | apply 0 torque to the actuated joint | torque (N m) | + | 2 | apply 1 torque to the actuated joint | torque (N m) | + + ### Observation Space + + The observation is a `ndarray` with shape `(6,)` that provides information about the + two rotational joint angles as well as their angular velocities: + + | Num | Observation | Min | Max | + |-----|------------------------------|---------------------|-------------------| + | 0 | Cosine of `theta1` | -1 | 1 | + | 1 | Sine of `theta1` | -1 | 1 | + | 2 | Cosine of `theta2` | -1 | 1 | + | 3 | Sine of `theta2` | -1 | 1 | + | 4 | Angular velocity of `theta1` | ~ -12.567 (-4 * pi) | ~ 12.567 (4 * pi) | + | 5 | Angular velocity of `theta2` | ~ -28.274 (-9 * pi) | ~ 28.274 (9 * pi) | + + where + - `theta1` is the angle of the first joint, where an angle of 0 indicates the first link is pointing directly + downwards. + - `theta2` is ***relative to the angle of the first link.*** + An angle of 0 corresponds to having the same angle between the two links. + + The angular velocities of `theta1` and `theta2` are bounded at ±4π, and ±9π rad/s respectively. + A state of `[1, 0, 1, 0, ..., ...]` indicates that both links are pointing downwards. + + ### Rewards + + The goal is to have the free end reach a designated target height in as few steps as possible, + and as such all steps that do not reach the goal incur a reward of -1. + Achieving the target height results in termination with a reward of 0. The reward threshold is -100. + + ### Starting State + + Each parameter in the underlying state (`theta1`, `theta2`, and the two angular velocities) is initialized + uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity. + + ### Episode End + + The episode ends if one of the following occurs: + 1. Termination: The free end reaches the target height, which is constructed as: + `-cos(theta1) - cos(theta2 + theta1) > 1.0` + 2. Truncation: Episode length is greater than 500 (200 for v0) + + ### Arguments + + No additional arguments are currently supported. + + ``` + env = gym.make('Acrobot-v1') + ``` + + By default, the dynamics of the acrobot follow those described in Sutton and Barto's book + [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/11/node4.html). + However, a `book_or_nips` parameter can be modified to change the pendulum dynamics to those described + in the original [NeurIPS paper](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html). + + ``` + # To change the dynamics as described above + env.env.book_or_nips = 'nips' + ``` + + See the following note and + the [implementation](https://github.com/openai/gym/blob/master/gym/envs/classic_control/acrobot.py) for details: + + > The dynamics equations were missing some terms in the NIPS paper which + are present in the book. R. Sutton confirmed in personal correspondence + that the experimental results shown in the paper and the book were + generated with the equations shown in the book. + However, there is the option to run the domain with the paper equations + by setting `book_or_nips = 'nips'` + + + ### Version History + + - v1: Maximum number of steps increased from 200 to 500. The observation space for v0 provided direct readings of + `theta1` and `theta2` in radians, having a range of `[-pi, pi]`. The v1 observation space as described here provides the + sine and cosine of each angle instead. + - v0: Initial versions release (1.0.0) (removed from gym for v1) + + ### References + - Sutton, R. S. (1996). Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding. + In D. Touretzky, M. C. Mozer, & M. Hasselmo (Eds.), Advances in Neural Information Processing Systems (Vol. 8). + MIT Press. https://proceedings.neurips.cc/paper/1995/file/8f1d43620bc6bb580df6e80b0dc05c48-Paper.pdf + - Sutton, R. S., Barto, A. G. (2018 ). Reinforcement Learning: An Introduction. The MIT Press. + """ + + metadata = { + "render_modes": ["human", "rgb_array"], + "render_fps": 15, + } + + dt = 0.2 + + LINK_LENGTH_1 = 1.0 # [m] + LINK_LENGTH_2 = 1.0 # [m] + LINK_MASS_1 = 1.0 #: [kg] mass of link 1 + LINK_MASS_2 = 1.0 #: [kg] mass of link 2 + LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 + LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 + LINK_MOI = 1.0 #: moments of inertia for both links + + MAX_VEL_1 = 4 * pi + MAX_VEL_2 = 9 * pi + + AVAIL_TORQUE = [-1.0, 0.0, +1] + + torque_noise_max = 0.0 + + SCREEN_DIM = 500 + + #: use dynamics equations from the nips paper or the book + book_or_nips = "book" + action_arrow = None + domain_fig = None + actions_num = 3 + + def __init__(self, render_mode: Optional[str] = None): + self.render_mode = render_mode + self.screen = None + self.clock = None + self.isopen = True + high = np.array( + [1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32 + ) + low = -high + self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32) + self.action_space = spaces.Discrete(3) + self.state = None + + def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): + super().reset(seed=seed) + # Note that if you use custom reset bounds, it may lead to out-of-bound + # state/observations. + low, high = utils.maybe_parse_reset_bounds( + options, -0.1, 0.1 # default low + ) # default high + self.state = self.np_random.uniform(low=low, high=high, size=(4,)).astype( + np.float32 + ) + + if self.render_mode == "human": + self.render() + return self._get_ob(), {} + + def step(self, a): + s = self.state + assert s is not None, "Call reset before using AcrobotEnv object." + torque = self.AVAIL_TORQUE[a] + + # Add noise to the force action + if self.torque_noise_max > 0: + torque += self.np_random.uniform( + -self.torque_noise_max, self.torque_noise_max + ) + + # Now, augment the state with our force action so it can be passed to + # _dsdt + s_augmented = np.append(s, torque) + + ns = rk4(self._dsdt, s_augmented, [0, self.dt]) + + ns[0] = wrap(ns[0], -pi, pi) + ns[1] = wrap(ns[1], -pi, pi) + ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1) + ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) + self.state = ns + terminated = self._terminal() + reward = -1.0 if not terminated else 0.0 + + if self.render_mode == "human": + self.render() + return (self._get_ob(), reward, terminated, False, {}) + + def _get_ob(self): + s = self.state + assert s is not None, "Call reset before using AcrobotEnv object." + return np.array( + [cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]], dtype=np.float32 + ) + + def _terminal(self): + s = self.state + assert s is not None, "Call reset before using AcrobotEnv object." + return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0) + + def _dsdt(self, s_augmented): + m1 = self.LINK_MASS_1 + m2 = self.LINK_MASS_2 + l1 = self.LINK_LENGTH_1 + lc1 = self.LINK_COM_POS_1 + lc2 = self.LINK_COM_POS_2 + I1 = self.LINK_MOI + I2 = self.LINK_MOI + g = 9.8 + a = s_augmented[-1] + s = s_augmented[:-1] + theta1 = s[0] + theta2 = s[1] + dtheta1 = s[2] + dtheta2 = s[3] + d1 = ( + m1 * lc1**2 + + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + + I1 + + I2 + ) + d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 + phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0) + phi1 = ( + -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) + - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) + + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2) + + phi2 + ) + if self.book_or_nips == "nips": + # the following line is consistent with the description in the + # paper + ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2**2 + I2 - d2**2 / d1) + else: + # the following line is consistent with the java implementation and the + # book + ddtheta2 = ( + a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 + ) / (m2 * lc2**2 + I2 - d2**2 / d1) + ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 + return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0 + + def render(self): + if self.render_mode is None: + logger.warn( + "You are calling render method without specifying any render mode. " + "You can specify the render_mode at initialization, " + f'e.g. gym("{self.spec.id}", render_mode="rgb_array")' + ) + return + + try: + import pygame + from pygame import gfxdraw + except ImportError: + raise DependencyNotInstalled( + "pygame is not installed, run `pip install gym[classic_control]`" + ) + + if self.screen is None: + pygame.init() + if self.render_mode == "human": + pygame.display.init() + self.screen = pygame.display.set_mode( + (self.SCREEN_DIM, self.SCREEN_DIM) + ) + else: # mode in "rgb_array" + self.screen = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM)) + if self.clock is None: + self.clock = pygame.time.Clock() + + surf = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM)) + surf.fill((255, 255, 255)) + s = self.state + + bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 # 2.2 for default + scale = self.SCREEN_DIM / (bound * 2) + offset = self.SCREEN_DIM / 2 + + if s is None: + return None + + p1 = [ + -self.LINK_LENGTH_1 * cos(s[0]) * scale, + self.LINK_LENGTH_1 * sin(s[0]) * scale, + ] + + p2 = [ + p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]) * scale, + p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1]) * scale, + ] + + xys = np.array([[0, 0], p1, p2])[:, ::-1] + thetas = [s[0] - pi / 2, s[0] + s[1] - pi / 2] + link_lengths = [self.LINK_LENGTH_1 * scale, self.LINK_LENGTH_2 * scale] + + pygame.draw.line( + surf, + start_pos=(-2.2 * scale + offset, 1 * scale + offset), + end_pos=(2.2 * scale + offset, 1 * scale + offset), + color=(0, 0, 0), + ) + + for ((x, y), th, llen) in zip(xys, thetas, link_lengths): + x = x + offset + y = y + offset + l, r, t, b = 0, llen, 0.1 * scale, -0.1 * scale + coords = [(l, b), (l, t), (r, t), (r, b)] + transformed_coords = [] + for coord in coords: + coord = pygame.math.Vector2(coord).rotate_rad(th) + coord = (coord[0] + x, coord[1] + y) + transformed_coords.append(coord) + gfxdraw.aapolygon(surf, transformed_coords, (0, 204, 204)) + gfxdraw.filled_polygon(surf, transformed_coords, (0, 204, 204)) + + gfxdraw.aacircle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0)) + gfxdraw.filled_circle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0)) + + surf = pygame.transform.flip(surf, False, True) + self.screen.blit(surf, (0, 0)) + + if self.render_mode == "human": + pygame.event.pump() + self.clock.tick(self.metadata["render_fps"]) + pygame.display.flip() + + elif self.render_mode == "rgb_array": + return np.transpose( + np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2) + ) + + def close(self): + if self.screen is not None: + import pygame + + pygame.display.quit() + pygame.quit() + self.isopen = False + + +def wrap(x, m, M): + """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which + truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n + For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. + + Args: + x: a scalar + m: minimum possible value in range + M: maximum possible value in range + + Returns: + x: a scalar, wrapped + """ + diff = M - m + while x > M: + x = x - diff + while x < m: + x = x + diff + return x + + +def bound(x, m, M=None): + """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* + have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. + + Args: + x: scalar + m: The lower bound + M: The upper bound + + Returns: + x: scalar, bound between min (m) and Max (M) + """ + if M is None: + M = m[1] + m = m[0] + # bound x between min (m) and Max (M) + return min(max(x, m), M) + + +def rk4(derivs, y0, t): + """ + Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta. + + Example for 2D system: + + >>> def derivs(x): + ... d1 = x[0] + 2*x[1] + ... d2 = -3*x[0] + 4*x[1] + ... return d1, d2 + + >>> dt = 0.0005 + >>> t = np.arange(0.0, 2.0, dt) + >>> y0 = (1,2) + >>> yout = rk4(derivs, y0, t) + + Args: + derivs: the derivative of the system and has the signature ``dy = derivs(yi)`` + y0: initial state vector + t: sample times + + Returns: + yout: Runge-Kutta approximation of the ODE + """ + + try: + Ny = len(y0) + except TypeError: + yout = np.zeros((len(t),), np.float_) + else: + yout = np.zeros((len(t), Ny), np.float_) + + yout[0] = y0 + + for i in np.arange(len(t) - 1): + + this = t[i] + dt = t[i + 1] - this + dt2 = dt / 2.0 + y0 = yout[i] + + k1 = np.asarray(derivs(y0)) + k2 = np.asarray(derivs(y0 + dt2 * k1)) + k3 = np.asarray(derivs(y0 + dt2 * k2)) + k4 = np.asarray(derivs(y0 + dt * k3)) + yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) + # We only care about the final timestep and we cleave off action value which will be zero + return yout[-1][:4] \ No newline at end of file diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py index 9f472dc..44d6ca0 100644 --- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py +++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py @@ -123,9 +123,9 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]): self.steps_beyond_terminated = None def step(self, action): - err_msg = f"{action!r} ({type(action)}) invalid" - assert self.action_space.contains(action), err_msg - assert self.state is not None, "Call reset before using step method." + # err_msg = f"{action!r} ({type(action)}) invalid" + # assert self.action_space.contains(action), err_msg + # assert self.state is not None, "Call reset before using step method." x, x_dot, theta, theta_dot = self.state # changed usage of action due to policy shaping approach force = action * self.force_mag @@ -153,7 +153,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]): theta_dot = theta_dot + self.tau * thetaacc theta = theta + self.tau * theta_dot - self.state = (x, x_dot[0], theta, theta_dot[0]) + self.state = (x, x_dot, theta, theta_dot) terminated = bool( x < -self.x_threshold @@ -181,7 +181,6 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]): if self.render_mode == "human": self.render() - return np.array(self.state, dtype=np.float32), reward, terminated, False, {} def reset( diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py index 758e676..79b7665 100644 --- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py +++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py @@ -126,7 +126,7 @@ class PendulumEnv(gym.Env): u = 2 * u # scaling the action to +/- 2 Nm - u = np.clip(u, -self.max_torque, self.max_torque)[0] + u = np.clip(u, -self.max_torque, self.max_torque) self.last_u = u # for rendering costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2) diff --git a/src/active_bo_ros/active_bo_ros/active_bo_topic.py b/src/active_bo_ros/active_bo_ros/active_bo_topic.py index 161033a..5dc1a2f 100644 --- a/src/active_bo_ros/active_bo_ros/active_bo_topic.py +++ b/src/active_bo_ros/active_bo_ros/active_bo_topic.py @@ -9,7 +9,11 @@ from rclpy.node import Node from rclpy.callback_groups import ReentrantCallbackGroup from active_bo_ros.BayesianOptimization.BayesianOptimization import BayesianOptimization + from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv +from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv +from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv +from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv import numpy as np import time @@ -34,12 +38,13 @@ class ActiveBOTopic(Node): 1, callback_group=bo_callback_group) self.active_bo_pending = False + self.bo_env = None self.bo_nr_weights = None self.bo_steps = None self.bo_episodes = None self.bo_runs = None self.bo_acq_fcn = None - self.bo_epsilon = None + self.bo_metric_parameter = None self.current_run = 0 self.current_episode = 0 @@ -58,7 +63,7 @@ class ActiveBOTopic(Node): self.rl_reward = None # RL Environments and BO - self.env = Continuous_MountainCarEnv() + self.env = None self.distance_penalty = 0 self.BO = None @@ -75,12 +80,13 @@ class ActiveBOTopic(Node): callback_group=mainloop_callback_group) def reset_bo_request(self): + self.bo_env = None self.bo_nr_weights = None self.bo_steps = None self.bo_episodes = None self.bo_runs = None self.bo_acq_fcn = None - self.bo_epsilon = None + self.bo_metric_parameter = None self.current_run = 0 self.current_episode = 0 @@ -88,12 +94,13 @@ class ActiveBOTopic(Node): if not self.active_bo_pending: self.get_logger().info('Active Bayesian Optimization request pending!') self.active_bo_pending = True + self.bo_env = msg.env self.bo_nr_weights = msg.nr_weights self.bo_steps = msg.max_steps self.bo_episodes = msg.nr_episodes self.bo_runs = msg.nr_runs self.bo_acq_fcn = msg.acquisition_function - self.bo_epsilon = msg.epsilon + self.bo_metric_parameter = msg.metric_parameter # initialize self.reward = np.zeros((self.bo_episodes, self.bo_runs)) @@ -116,6 +123,18 @@ class ActiveBOTopic(Node): def mainloop_callback(self): if self.active_bo_pending: + # set rl environment + if self.bo_env == "Mountain Car": + self.env = Continuous_MountainCarEnv() + elif self.bo_env == "Cartpole": + self.env = CartPoleEnv() + elif self.bo_env == "Acrobot": + self.env = AcrobotEnv() + elif self.bo_env == "Pendulum": + self.env = PendulumEnv() + else: + raise NotImplementedError + if self.BO is None: self.BO = BayesianOptimization(self.env, self.bo_steps, @@ -139,6 +158,7 @@ class ActiveBOTopic(Node): self.active_bo_pub.publish(bo_response) self.reset_bo_request() self.active_bo_pending = False + self.BO = None else: if self.active_rl_pending: @@ -151,10 +171,11 @@ class ActiveBOTopic(Node): self.get_logger().error(f'Active Reinforcement Learning failed to add new observation: {e}') else: if self.current_episode < self.bo_episodes: - if np.random.uniform(0.0, 1.0, 1) < self.bo_epsilon: + if np.random.uniform(0.0, 1.0, 1) < self.bo_metric_parameter: active_rl_request = ActiveRL() old_policy, _, old_weights = self.BO.get_best_result() + active_rl_request.env = self.bo_env active_rl_request.policy = old_policy.tolist() active_rl_request.weights = old_weights.tolist() diff --git a/src/active_bo_ros/active_bo_ros/active_rl_topic.py b/src/active_bo_ros/active_bo_ros/active_rl_topic.py index 535ce65..582d7a6 100644 --- a/src/active_bo_ros/active_bo_ros/active_rl_topic.py +++ b/src/active_bo_ros/active_bo_ros/active_rl_topic.py @@ -9,6 +9,10 @@ from rclpy.node import Node from rclpy.callback_groups import ReentrantCallbackGroup from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv +from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv +from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv +from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv + import numpy as np import time @@ -32,6 +36,7 @@ class ActiveRLService(Node): 1, callback_group=rl_callback_group) self.active_rl_pending = False + self.rl_env = None self.rl_policy = None self.rl_weights = None self.rl_reward = 0.0 @@ -58,7 +63,8 @@ class ActiveRLService(Node): self.eval_weights = None # RL Environments - self.env = Continuous_MountainCarEnv(render_mode='rgb_array') + self.env = None + self.distance_penalty = 0 self.best_pol_shown = False @@ -69,13 +75,26 @@ class ActiveRLService(Node): callback_group=mainloop_callback_group) def reset_rl_request(self): + self.rl_env = None self.rl_policy = None self.rl_weights = None def active_rl_callback(self, msg): - self.rl_policy = msg.policy + self.rl_env = msg.env + self.rl_policy = np.array(msg.policy, dtype=np.float32) self.rl_weights = msg.weights + if self.rl_env == "Mountain Car": + self.env = Continuous_MountainCarEnv(render_mode="rgb_array") + elif self.rl_env == "Cartpole": + self.env = CartPoleEnv(render_mode="rgb_array") + elif self.rl_env == "Acrobot": + self.env = AcrobotEnv(render_mode="rgb_array") + elif self.rl_env == "Pendulum": + self.env = PendulumEnv(render_mode="rgb_array") + else: + raise NotImplementedError + self.get_logger().info('Active RL: Called!') self.env.reset() self.active_rl_pending = True @@ -85,15 +104,20 @@ class ActiveRLService(Node): self.eval_weights = None def active_rl_eval_callback(self, msg): - self.eval_policy = msg.policy + self.eval_policy = np.array(msg.policy, dtype=np.float32) self.eval_weights = msg.weights self.get_logger().info('Active RL Eval: Responded!') + self.env.reset() self.eval_response_received = True def next_image(self, policy): action = policy[self.rl_step] - output = self.env.step(action) + action_clipped = action.clip(min=-1.0, max=1.0) + self.get_logger().info(str(action_clipped) + str(type(action_clipped))) + output = self.env.step(action_clipped.astype(np.float32)) + + self.get_logger().info(str(output)) self.rl_reward += output[1] done = output[2] @@ -133,7 +157,7 @@ class ActiveRLService(Node): self.rl_reward = 0.0 eval_request = ActiveRL() - eval_request.policy = self.rl_policy + eval_request.policy = self.rl_policy.tolist() eval_request.weights = self.rl_weights self.eval_pub.publish(eval_request) diff --git a/src/active_bo_ros/active_bo_ros/rl_service.py b/src/active_bo_ros/active_bo_ros/rl_service.py index 67f26f6..e259384 100644 --- a/src/active_bo_ros/active_bo_ros/rl_service.py +++ b/src/active_bo_ros/active_bo_ros/rl_service.py @@ -5,6 +5,9 @@ import rclpy from rclpy.node import Node from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv +from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv +from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv +from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv import numpy as np @@ -16,7 +19,7 @@ class RLService(Node): self.publisher = self.create_publisher(ImageFeedback, 'rl_feedback', 1) - self.env = Continuous_MountainCarEnv(render_mode='rgb_array') + self.env = None self.distance_penalty = 0 def rl_callback(self, request, response): @@ -25,13 +28,27 @@ class RLService(Node): reward = 0 step_count = 0 - policy = request.policy + policy = np.array(request.policy, dtype=np.float32) + rl_env = request.env + + if rl_env == "Mountain Car": + self.env = Continuous_MountainCarEnv(render_mode="rgb_array") + elif rl_env == "Cartpole": + self.env = CartPoleEnv(render_mode="rgb_array") + elif rl_env == "Acrobot": + self.env = AcrobotEnv(render_mode="rgb_array") + elif rl_env == "Pendulum": + self.env = PendulumEnv(render_mode="rgb_array") + else: + raise NotImplementedError self.env.reset() for i in range(len(policy)): action = policy[i] - output = self.env.step(action) + action_clipped = action.clip(min=-1.0, max=1.0) + self.get_logger().info(str(action_clipped) + str(type(action_clipped))) + output = self.env.step(action_clipped.astype(np.float32)) reward += output[1] done = output[2] @@ -64,6 +81,7 @@ class RLService(Node): return response + def main(args=None): rclpy.init(args=args) @@ -73,5 +91,6 @@ def main(args=None): rclpy.shutdown() + if __name__ == '__main__': main()