diff --git a/src/active_bo_msgs/CMakeLists.txt b/src/active_bo_msgs/CMakeLists.txt
index b100134..affd789 100644
--- a/src/active_bo_msgs/CMakeLists.txt
+++ b/src/active_bo_msgs/CMakeLists.txt
@@ -21,6 +21,7 @@ find_package(rosidl_default_generators REQUIRED)
 
 rosidl_generate_interfaces(${PROJECT_NAME}
   "srv/WeightToPolicy.srv"
+  "action/RLRollOut.action"
 )
 
 if(BUILD_TESTING)
diff --git a/src/active_bo_msgs/action/RLRollOut.action b/src/active_bo_msgs/action/RLRollOut.action
new file mode 100644
index 0000000..2311ae5
--- /dev/null
+++ b/src/active_bo_msgs/action/RLRollOut.action
@@ -0,0 +1,10 @@
+float32[] policy
+---
+float32 reward
+uint16 final_step
+---
+int16 height
+int16 width
+uint8[] red
+uint8[] green
+uint8[] blue
\ No newline at end of file
diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/ContinuousMountainCar.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/ContinuousMountainCar.py
new file mode 100644
index 0000000..a2c31fb
--- /dev/null
+++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/ContinuousMountainCar.py
@@ -0,0 +1,307 @@
+"""
+@author: Olivier Sigaud
+
+A merge between two sources:
+
+* Adaptation of the MountainCar Environment from the "FAReinforcement" library
+of Jose Antonio Martin H. (version 1.0), adapted by 'Tom Schaul, tom@idsia.ch'
+and then modified by Arnaud de Broissia
+
+* the gym MountainCar environment
+itself from
+http://incompleteideas.net/sutton/MountainCar/MountainCar1.cp
+permalink: https://perma.cc/6Z2N-PFWC
+"""
+
+import math
+from typing import Optional
+
+import numpy as np
+
+import gym
+from gym import spaces
+from gym.envs.classic_control import utils
+from gym.error import DependencyNotInstalled
+
+
+class Continuous_MountainCarEnv(gym.Env):
+    """
+    ### Description
+
+    The Mountain Car MDP is a deterministic MDP that consists of a car placed stochastically
+    at the bottom of a sinusoidal valley, with the only possible actions being the accelerations
+    that can be applied to the car in either direction. The goal of the MDP is to strategically
+    accelerate the car to reach the goal state on top of the right hill. There are two versions
+    of the mountain car domain in gym: one with discrete actions and one with continuous.
+    This version is the one with continuous actions.
+
+    This MDP first appeared in [Andrew Moore's PhD Thesis (1990)](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-209.pdf)
+
+    ```
+    @TECHREPORT{Moore90efficientmemory-based,
+        author = {Andrew William Moore},
+        title = {Efficient Memory-based Learning for Robot Control},
+        institution = {University of Cambridge},
+        year = {1990}
+    }
+    ```
+
+    ### Observation Space
+
+    The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:
+
+    | Num | Observation                          | Min  | Max | Unit           |
+    |-----|--------------------------------------|------|-----|----------------|
+    | 0   | position of the car along the x-axis | -Inf | Inf | position (m)   |
+    | 1   | velocity of the car                  | -Inf | Inf | velocity (m/s) |
+
+    ### Action Space
+
+    The action is a `ndarray` with shape `(1,)`, representing the directional force applied on the car.
+    The action is clipped in the range `[-1,1]` and multiplied by a power of 0.0015.
+
+    ### Transition Dynamics:
+
+    Given an action, the mountain car follows the following transition dynamics:
+
+    *velocity<sub>t+1</sub> = velocity<sub>t</sub> + force * self.power - 0.0025 * cos(3 * position<sub>t</sub>)*
+
+    *position<sub>t+1</sub> = position<sub>t</sub> + velocity<sub>t+1</sub>*
+
+    where force is the action clipped to the range `[-1,1]` and power is a constant 0.0015.
+    The collisions at either end are inelastic with the velocity set to 0 upon collision with the wall.
+    The position is clipped to the range [-1.2, 0.6] and velocity is clipped to the range [-0.07, 0.07].
+
+    ### Reward
+
+    A negative reward of *-0.1 * action<sup>2</sup>* is received at each timestep to penalise for
+    taking actions of large magnitude. If the mountain car reaches the goal then a positive reward of +100
+    is added to the negative reward for that timestep.
+
+    ### Starting State
+
+    The position of the car is assigned a uniform random value in `[-0.6 , -0.4]`.
+    The starting velocity of the car is always assigned to 0.
+
+    ### Episode End
+
+    The episode ends if either of the following happens:
+    1. Termination: The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
+    2. Truncation: The length of the episode is 999.
+
+    ### Arguments
+
+    ```
+    gym.make('MountainCarContinuous-v0')
+    ```
+
+    ### Version History
+
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 30,
+    }
+
+    def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
+        self.min_action = -1.0
+        self.max_action = 1.0
+        self.min_position = -1.2
+        self.max_position = 0.6
+        self.max_speed = 0.07
+        self.goal_position = (
+            0.45  # was 0.5 in gym, 0.45 in Arnaud de Broissia's version
+        )
+        self.goal_velocity = goal_velocity
+        self.power = 0.0015
+
+        self.low_state = np.array(
+            [self.min_position, -self.max_speed], dtype=np.float32
+        )
+        self.high_state = np.array(
+            [self.max_position, self.max_speed], dtype=np.float32
+        )
+
+        self.render_mode = render_mode
+
+        self.screen_width = 600
+        self.screen_height = 400
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+
+        self.action_space = spaces.Box(
+            low=self.min_action, high=self.max_action, shape=(1,), dtype=np.float32
+        )
+        self.observation_space = spaces.Box(
+            low=self.low_state, high=self.high_state, dtype=np.float32
+        )
+
+    def step(self, action: np.ndarray):
+
+        position = self.state[0]
+        velocity = self.state[1]
+        force = min(max(action[0], self.min_action), self.max_action)
+
+        velocity += force * self.power - 0.0025 * math.cos(3 * position)
+        if velocity > self.max_speed:
+            velocity = self.max_speed
+        if velocity < -self.max_speed:
+            velocity = -self.max_speed
+        position += velocity
+        if position > self.max_position:
+            position = self.max_position
+        if position < self.min_position:
+            position = self.min_position
+        if position == self.min_position and velocity < 0:
+            velocity = 0
+
+        # Convert a possible numpy bool to a Python bool.
+        terminated = bool(
+            position >= self.goal_position and velocity >= self.goal_velocity
+        )
+
+        reward = 0
+        if terminated:
+            reward += 10
+        reward -= math.pow(action[0], 2) * 0.1
+        reward -= 1
+
+        self.state = np.array([position, velocity], dtype=np.float32)
+
+        if self.render_mode == "human":
+            self.render()
+        elif self.render_mode == "rgb_array":
+            rgb_array = self.render()
+            return self.state, reward, terminated, False, {}, rgb_array
+        return self.state, reward, terminated, False, {}
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(options, -0.6, -0.4)
+        self.state = np.array([self.np_random.uniform(low=low, high=high), 0])
+
+        if self.render_mode == "human":
+            self.render()
+        elif self.render_mode == "rgb_array":
+            rgb_array = self.render()
+            return np.array(self.state, dtype=np.float32), {}, rgb_array
+        return np.array(self.state, dtype=np.float32), {}
+
+    def _height(self, xs):
+        return np.sin(3 * xs) * 0.45 + 0.55
+
+    def render(self):
+        if self.render_mode is None:
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gym[classic_control]`"
+            )
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.screen_width, self.screen_height)
+                )
+            else:  # mode == "rgb_array":
+                self.screen = pygame.Surface((self.screen_width, self.screen_height))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        world_width = self.max_position - self.min_position
+        scale = self.screen_width / world_width
+        carwidth = 40
+        carheight = 20
+
+        self.surf = pygame.Surface((self.screen_width, self.screen_height))
+        self.surf.fill((255, 255, 255))
+
+        pos = self.state[0]
+
+        xs = np.linspace(self.min_position, self.max_position, 100)
+        ys = self._height(xs)
+        xys = list(zip((xs - self.min_position) * scale, ys * scale))
+
+        pygame.draw.aalines(self.surf, points=xys, closed=False, color=(0, 0, 0))
+
+        clearance = 10
+
+        l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0
+        coords = []
+        for c in [(l, b), (l, t), (r, t), (r, b)]:
+            c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos))
+            coords.append(
+                (
+                    c[0] + (pos - self.min_position) * scale,
+                    c[1] + clearance + self._height(pos) * scale,
+                )
+            )
+
+        gfxdraw.aapolygon(self.surf, coords, (0, 0, 0))
+        gfxdraw.filled_polygon(self.surf, coords, (0, 0, 0))
+
+        for c in [(carwidth / 4, 0), (-carwidth / 4, 0)]:
+            c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos))
+            wheel = (
+                int(c[0] + (pos - self.min_position) * scale),
+                int(c[1] + clearance + self._height(pos) * scale),
+            )
+
+            gfxdraw.aacircle(
+                self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)
+            )
+            gfxdraw.filled_circle(
+                self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)
+            )
+
+        flagx = int((self.goal_position - self.min_position) * scale)
+        flagy1 = int(self._height(self.goal_position) * scale)
+        flagy2 = flagy1 + 50
+        gfxdraw.vline(self.surf, flagx, flagy1, flagy2, (0, 0, 0))
+
+        gfxdraw.aapolygon(
+            self.surf,
+            [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)],
+            (204, 204, 0),
+        )
+        gfxdraw.filled_polygon(
+            self.surf,
+            [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)],
+            (204, 204, 0),
+        )
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+        self.screen.blit(self.surf, (0, 0))
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
\ No newline at end of file
diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/__init__.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/active_bo_ros/active_bo_ros/rl_action_server.py b/src/active_bo_ros/active_bo_ros/rl_action_server.py
new file mode 100644
index 0000000..3cbebaa
--- /dev/null
+++ b/src/active_bo_ros/active_bo_ros/rl_action_server.py
@@ -0,0 +1,76 @@
+from active_bo_msgs.action import RLRollOut
+
+import rclpy
+from rclpy.node import Node
+from rclpy.action import ActionServer
+
+from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
+
+import numpy as np
+import time
+
+class RLActionServer(Node):
+    def __init__(self):
+        super().__init__('rl_action')
+        self.action_server = ActionServer(self,
+                                          RLRollOut,
+                                          'reinforcement_learning_action',
+                                          self.execute_callback)
+
+        self.env = Continuous_MountainCarEnv(render_mode='rgb_array')
+        self.distance_penalty = 0
+
+    def execute_callback(self, action_handle):
+
+        feedback_msg = RLRollOut.Feedback()
+
+        # Reset the environment so that self.env.state is initialized before stepping.
+        self.env.reset()
+
+        reward = 0
+        step_count = 0
+        policy = action_handle.request.policy
+
+        for i in range(len(policy)):
+            # The environment indexes into the action, so wrap the scalar policy entry.
+            action = np.array([policy[i]], dtype=np.float32)
+            output = self.env.step(action)
+
+            reward += output[1]
+            done = output[2]
+            step_count += 1
+
+            rgb_array = output[5]
+            rgb_shape = rgb_array.shape
+
+            # Only the image dimensions are published as feedback; the RGB channels are left unset.
+            feedback_msg.height = rgb_shape[0]
+            feedback_msg.width = rgb_shape[1]
+
+            action_handle.publish_feedback(feedback_msg)
+
+            if done:
+                break
+
+            distance = -(self.env.goal_position - output[0][0])
+            reward += distance * self.distance_penalty
+
+            time.sleep(0.01)
+
+        action_handle.succeed()
+
+        result = RLRollOut.Result()
+        result.reward = reward
+        result.final_step = step_count
+
+        return result
+
+def main(args=None):
+    rclpy.init(args=args)
+
+    rl_action_server = RLActionServer()
+
+    rclpy.spin(rl_action_server)
+
+if __name__ == '__main__':
+    main()
diff --git a/src/active_bo_ros/launch/rl_action.launch.py b/src/active_bo_ros/launch/rl_action.launch.py
new file mode 100755
index 0000000..67b95ee
--- /dev/null
+++ b/src/active_bo_ros/launch/rl_action.launch.py
@@ -0,0 +1,11 @@
+from launch import LaunchDescription
+from launch_ros.actions import Node
+
+def generate_launch_description():
+    return LaunchDescription([
+        Node(
+            package='active_bo_ros',
+            executable='rl_action',
+            name='rl_action'
+        ),
+    ])
\ No newline at end of file
diff --git a/src/active_bo_ros/setup.py b/src/active_bo_ros/setup.py
index 8154794..a15bcc3 100644
--- a/src/active_bo_ros/setup.py
+++ b/src/active_bo_ros/setup.py
@@ -7,14 +7,16 @@ package_name = 'active_bo_ros'
 setup(
     name=package_name,
     version='0.0.0',
-    packages=[package_name, package_name+'/PolicyModel'],
+    packages=[package_name,
+              package_name+'/PolicyModel',
+              package_name+'/ReinforcementLearning'],
     data_files=[
         ('share/ament_index/resource_index/packages',
             ['resource/' + package_name]),
         ('share/' + package_name, ['package.xml']),
         (os.path.join('share', package_name), glob('launch/*.launch.py')),
     ],
-    install_requires=['setuptools', 'numpy'],
+    install_requires=['setuptools'],
     zip_safe=True,
     maintainer='cpsfeith',
     maintainer_email='nikolaus.feith@unileoben.ac.at',
@@ -23,7 +25,8 @@ setup(
     tests_require=['pytest'],
     entry_points={
         'console_scripts': [
-            'policy_srv = active_bo_ros.policy_service:main'
+            'policy_srv = active_bo_ros.policy_service:main',
+            'rl_action = active_bo_ros.rl_action_server:main'
         ],
     },
 )
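
Note: below is a minimal sketch of a one-shot client for exercising the new `RLRollOut` action. Only the action type and the `reinforcement_learning_action` name come from this diff; the node name `rl_rollout_client` and the 500-step constant policy are illustrative assumptions, not part of the change.

```python
import rclpy
from rclpy.action import ActionClient
from rclpy.node import Node

from active_bo_msgs.action import RLRollOut


def main(args=None):
    rclpy.init(args=args)
    node = Node('rl_rollout_client')  # hypothetical test node, not part of this PR
    client = ActionClient(node, RLRollOut, 'reinforcement_learning_action')

    goal = RLRollOut.Goal()
    # An open-loop 500-step action sequence, purely for illustration.
    goal.policy = [0.5] * 500

    client.wait_for_server()

    # Send the goal and wait until it is accepted.
    send_future = client.send_goal_async(goal)
    rclpy.spin_until_future_complete(node, send_future)
    goal_handle = send_future.result()

    # Wait for the rollout to finish and read the accumulated reward.
    result_future = goal_handle.get_result_async()
    rclpy.spin_until_future_complete(node, result_future)
    result = result_future.result().result
    node.get_logger().info(
        f'reward: {result.reward:.3f}, final step: {result.final_step}')

    node.destroy_node()
    rclpy.shutdown()


if __name__ == '__main__':
    main()
```

The goal is sent asynchronously and the node is spun until each future completes, which is the usual pattern for a one-shot rclpy action client and avoids blocking inside a callback.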
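Note: a quick standalone rollout of `Continuous_MountainCarEnv`, assuming `gym` and `pygame` are installed, to illustrate the modified `reset()`/`step()` signatures in `rgb_array` mode: both return the rendered frame as an extra trailing element, which the action server reads as `output[5]`. The 200-step random policy is only for illustration.

```python
import numpy as np

from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv

# Standalone rollout with a random policy, mirroring what the action server does per goal.
env = Continuous_MountainCarEnv(render_mode='rgb_array')
state, _, rgb = env.reset(seed=0)  # rgb_array mode appends the rendered frame

total_reward = 0.0
for _ in range(200):
    action = env.action_space.sample()  # shape (1,), force in [-1, 1]
    state, reward, terminated, _, _, rgb = env.step(action)
    total_reward += reward
    if terminated:
        break

print(f'return: {total_reward:.2f}, final position: {state[0]:.3f}, frame shape: {rgb.shape}')
env.close()
```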