diff --git a/src/active_bo_msgs/msg/ActiveBORequest.msg b/src/active_bo_msgs/msg/ActiveBORequest.msg
index 7e63345..61c61ef 100644
--- a/src/active_bo_msgs/msg/ActiveBORequest.msg
+++ b/src/active_bo_msgs/msg/ActiveBORequest.msg
@@ -1,6 +1,8 @@
+string env
+string metric
 uint16 nr_weights
 uint16 max_steps
 uint16 nr_episodes
 uint16 nr_runs
 string acquisition_function
-float32 epsilon
\ No newline at end of file
+float32 metric_parameter
\ No newline at end of file
diff --git a/src/active_bo_msgs/msg/ActiveRL.msg b/src/active_bo_msgs/msg/ActiveRL.msg
index 60dca7b..23b086f 100644
--- a/src/active_bo_msgs/msg/ActiveRL.msg
+++ b/src/active_bo_msgs/msg/ActiveRL.msg
@@ -1,2 +1,3 @@
+string env
 float32[] policy
 float32[] weights
\ No newline at end of file
diff --git a/src/active_bo_msgs/srv/RLRollOut.srv b/src/active_bo_msgs/srv/RLRollOut.srv
index 00555f7..0581f5c 100644
--- a/src/active_bo_msgs/srv/RLRollOut.srv
+++ b/src/active_bo_msgs/srv/RLRollOut.srv
@@ -1,3 +1,4 @@
+string env
 float32[] policy
 ---
 float32 reward
diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
index 4102e17..16534b1 100644
--- a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
+++ b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
@@ -47,9 +47,12 @@ class BayesianOptimization:
         env_reward = 0.0
         step_count = 0
 
+        self.env.reset()
+
         for i in range(len(policy)):
             action = policy[i]
-            output = self.env.step(action)
+            action_clipped = action.clip(min=-1.0, max=1.0)
+            output = self.env.step(action_clipped.astype(np.float32))
 
             env_reward += output[1]
             done = output[2]
diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py
index e69de29..a3d3b6e 100644
--- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py
+++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Acrobot.py
@@ -0,0 +1,465 @@
+"""classic Acrobot task"""
+from typing import Optional
+
+import numpy as np
+from numpy import cos, pi, sin
+
+from gym import core, logger, spaces
+from gym.error import DependencyNotInstalled
+
+__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
+__credits__ = [
+    "Alborz Geramifard",
+    "Robert H. Klein",
+    "Christoph Dann",
+    "William Dabney",
+    "Jonathan P. How",
+]
+__license__ = "BSD 3-Clause"
+__author__ = "Christoph Dann <cdann@cdann.de>"
+
+# SOURCE:
+# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py
+from gym.envs.classic_control import utils
+
+
+class AcrobotEnv(core.Env):
+    """
+    ### Description
+
+    The Acrobot environment is based on Sutton's work in
+    ["Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding"](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html)
+    and [Sutton and Barto's book](http://www.incompleteideas.net/book/the-book-2nd.html).
+    The system consists of two links connected linearly to form a chain, with one end of
+    the chain fixed. The joint between the two links is actuated. The goal is to apply
+    torques on the actuated joint to swing the free end of the linear chain above a
+    given height while starting from the initial state of hanging downwards.
+
+    As seen in the **Gif**: two blue links connected by two green joints. The joint in
+    between the two links is actuated. The goal is to swing the free end of the outer-link
+    to reach the target height (black horizontal line above system) by applying torque on
+    the actuator.
+
+    ### Action Space
+
+    The action is discrete, deterministic, and represents the torque applied on the actuated
+    joint between the two links.
+
+    | Num | Action                                | Unit         |
+    |-----|---------------------------------------|--------------|
+    | 0   | apply -1 torque to the actuated joint | torque (N m) |
+    | 1   | apply 0 torque to the actuated joint  | torque (N m) |
+    | 2   | apply 1 torque to the actuated joint  | torque (N m) |
+
+    ### Observation Space
+
+    The observation is a `ndarray` with shape `(6,)` that provides information about the
+    two rotational joint angles as well as their angular velocities:
+
+    | Num | Observation                  | Min                 | Max               |
+    |-----|------------------------------|---------------------|-------------------|
+    | 0   | Cosine of `theta1`           | -1                  | 1                 |
+    | 1   | Sine of `theta1`             | -1                  | 1                 |
+    | 2   | Cosine of `theta2`           | -1                  | 1                 |
+    | 3   | Sine of `theta2`             | -1                  | 1                 |
+    | 4   | Angular velocity of `theta1` | ~ -12.567 (-4 * pi) | ~ 12.567 (4 * pi) |
+    | 5   | Angular velocity of `theta2` | ~ -28.274 (-9 * pi) | ~ 28.274 (9 * pi) |
+
+    where
+    - `theta1` is the angle of the first joint, where an angle of 0 indicates the first link is pointing directly
+    downwards.
+    - `theta2` is ***relative to the angle of the first link.***
+        An angle of 0 corresponds to having the same angle between the two links.
+
+    The angular velocities of `theta1` and `theta2` are bounded at ±4π, and ±9π rad/s respectively.
+    A state of `[1, 0, 1, 0, ..., ...]` indicates that both links are pointing downwards.
+
+    ### Rewards
+
+    The goal is to have the free end reach a designated target height in as few steps as possible,
+    and as such all steps that do not reach the goal incur a reward of -1.
+    Achieving the target height results in termination with a reward of 0. The reward threshold is -100.
+
+    ### Starting State
+
+    Each parameter in the underlying state (`theta1`, `theta2`, and the two angular velocities) is initialized
+    uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity.
+
+    ### Episode End
+
+    The episode ends if one of the following occurs:
+    1. Termination: The free end reaches the target height, which is constructed as:
+    `-cos(theta1) - cos(theta2 + theta1) > 1.0`
+    2. Truncation: Episode length is greater than 500 (200 for v0)
+
+    ### Arguments
+
+    No additional arguments are currently supported.
+
+    ```
+    env = gym.make('Acrobot-v1')
+    ```
+
+    By default, the dynamics of the acrobot follow those described in Sutton and Barto's book
+    [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/11/node4.html).
+    However, a `book_or_nips` parameter can be modified to change the pendulum dynamics to those described
+    in the original [NeurIPS paper](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html).
+
+    ```
+    # To change the dynamics as described above
+    env.env.book_or_nips = 'nips'
+    ```
+
+    See the following note and
+    the [implementation](https://github.com/openai/gym/blob/master/gym/envs/classic_control/acrobot.py) for details:
+
+    > The dynamics equations were missing some terms in the NIPS paper which
+            are present in the book. R. Sutton confirmed in personal correspondence
+            that the experimental results shown in the paper and the book were
+            generated with the equations shown in the book.
+            However, there is the option to run the domain with the paper equations
+            by setting `book_or_nips = 'nips'`
+
+
+    ### Version History
+
+    - v1: Maximum number of steps increased from 200 to 500. The observation space for v0 provided direct readings of
+    `theta1` and `theta2` in radians, having a range of `[-pi, pi]`. The v1 observation space as described here provides the
+    sine and cosine of each angle instead.
+    - v0: Initial versions release (1.0.0) (removed from gym for v1)
+
+    ### References
+    - Sutton, R. S. (1996). Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding.
+        In D. Touretzky, M. C. Mozer, & M. Hasselmo (Eds.), Advances in Neural Information Processing Systems (Vol. 8).
+        MIT Press. https://proceedings.neurips.cc/paper/1995/file/8f1d43620bc6bb580df6e80b0dc05c48-Paper.pdf
+    - Sutton, R. S., Barto, A. G. (2018 ). Reinforcement Learning: An Introduction. The MIT Press.
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 15,
+    }
+
+    dt = 0.2
+
+    LINK_LENGTH_1 = 1.0  # [m]
+    LINK_LENGTH_2 = 1.0  # [m]
+    LINK_MASS_1 = 1.0  #: [kg] mass of link 1
+    LINK_MASS_2 = 1.0  #: [kg] mass of link 2
+    LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
+    LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
+    LINK_MOI = 1.0  #: moments of inertia for both links
+
+    MAX_VEL_1 = 4 * pi
+    MAX_VEL_2 = 9 * pi
+
+    AVAIL_TORQUE = [-1.0, 0.0, +1]
+
+    torque_noise_max = 0.0
+
+    SCREEN_DIM = 500
+
+    #: use dynamics equations from the nips paper or the book
+    book_or_nips = "book"
+    action_arrow = None
+    domain_fig = None
+    actions_num = 3
+
+    def __init__(self, render_mode: Optional[str] = None):
+        self.render_mode = render_mode
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+        high = np.array(
+            [1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32
+        )
+        low = -high
+        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
+        self.action_space = spaces.Discrete(3)
+        self.state = None
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(
+            options, -0.1, 0.1  # default low
+        )  # default high
+        self.state = self.np_random.uniform(low=low, high=high, size=(4,)).astype(
+            np.float32
+        )
+
+        if self.render_mode == "human":
+            self.render()
+        return self._get_ob(), {}
+
+    def step(self, a):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        torque = self.AVAIL_TORQUE[a]
+
+        # Add noise to the force action
+        if self.torque_noise_max > 0:
+            torque += self.np_random.uniform(
+                -self.torque_noise_max, self.torque_noise_max
+            )
+
+        # Now, augment the state with our force action so it can be passed to
+        # _dsdt
+        s_augmented = np.append(s, torque)
+
+        ns = rk4(self._dsdt, s_augmented, [0, self.dt])
+
+        ns[0] = wrap(ns[0], -pi, pi)
+        ns[1] = wrap(ns[1], -pi, pi)
+        ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
+        ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
+        self.state = ns
+        terminated = self._terminal()
+        reward = -1.0 if not terminated else 0.0
+
+        if self.render_mode == "human":
+            self.render()
+        return (self._get_ob(), reward, terminated, False, {})
+
+    def _get_ob(self):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        return np.array(
+            [cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]], dtype=np.float32
+        )
+
+    def _terminal(self):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0)
+
+    def _dsdt(self, s_augmented):
+        m1 = self.LINK_MASS_1
+        m2 = self.LINK_MASS_2
+        l1 = self.LINK_LENGTH_1
+        lc1 = self.LINK_COM_POS_1
+        lc2 = self.LINK_COM_POS_2
+        I1 = self.LINK_MOI
+        I2 = self.LINK_MOI
+        g = 9.8
+        a = s_augmented[-1]
+        s = s_augmented[:-1]
+        theta1 = s[0]
+        theta2 = s[1]
+        dtheta1 = s[2]
+        dtheta2 = s[3]
+        d1 = (
+            m1 * lc1**2
+            + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2))
+            + I1
+            + I2
+        )
+        d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
+        phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0)
+        phi1 = (
+            -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
+            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
+            + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2)
+            + phi2
+        )
+        if self.book_or_nips == "nips":
+            # the following line is consistent with the description in the
+            # paper
+            ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2**2 + I2 - d2**2 / d1)
+        else:
+            # the following line is consistent with the java implementation and the
+            # book
+            ddtheta2 = (
+                a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
+            ) / (m2 * lc2**2 + I2 - d2**2 / d1)
+        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
+        return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
+
+    def render(self):
+        if self.render_mode is None:
+            logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gym[classic_control]`"
+            )
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.SCREEN_DIM, self.SCREEN_DIM)
+                )
+            else:  # mode in "rgb_array"
+                self.screen = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        surf = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM))
+        surf.fill((255, 255, 255))
+        s = self.state
+
+        bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2  # 2.2 for default
+        scale = self.SCREEN_DIM / (bound * 2)
+        offset = self.SCREEN_DIM / 2
+
+        if s is None:
+            return None
+
+        p1 = [
+            -self.LINK_LENGTH_1 * cos(s[0]) * scale,
+            self.LINK_LENGTH_1 * sin(s[0]) * scale,
+        ]
+
+        p2 = [
+            p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]) * scale,
+            p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1]) * scale,
+        ]
+
+        xys = np.array([[0, 0], p1, p2])[:, ::-1]
+        thetas = [s[0] - pi / 2, s[0] + s[1] - pi / 2]
+        link_lengths = [self.LINK_LENGTH_1 * scale, self.LINK_LENGTH_2 * scale]
+
+        pygame.draw.line(
+            surf,
+            start_pos=(-2.2 * scale + offset, 1 * scale + offset),
+            end_pos=(2.2 * scale + offset, 1 * scale + offset),
+            color=(0, 0, 0),
+        )
+
+        for ((x, y), th, llen) in zip(xys, thetas, link_lengths):
+            x = x + offset
+            y = y + offset
+            l, r, t, b = 0, llen, 0.1 * scale, -0.1 * scale
+            coords = [(l, b), (l, t), (r, t), (r, b)]
+            transformed_coords = []
+            for coord in coords:
+                coord = pygame.math.Vector2(coord).rotate_rad(th)
+                coord = (coord[0] + x, coord[1] + y)
+                transformed_coords.append(coord)
+            gfxdraw.aapolygon(surf, transformed_coords, (0, 204, 204))
+            gfxdraw.filled_polygon(surf, transformed_coords, (0, 204, 204))
+
+            gfxdraw.aacircle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0))
+            gfxdraw.filled_circle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0))
+
+        surf = pygame.transform.flip(surf, False, True)
+        self.screen.blit(surf, (0, 0))
+
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
+
+
+def wrap(x, m, M):
+    """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
+    truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n
+    For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
+
+    Args:
+        x: a scalar
+        m: minimum possible value in range
+        M: maximum possible value in range
+
+    Returns:
+        x: a scalar, wrapped
+    """
+    diff = M - m
+    while x > M:
+        x = x - diff
+    while x < m:
+        x = x + diff
+    return x
+
+
+def bound(x, m, M=None):
+    """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
+    have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
+
+    Args:
+        x: scalar
+        m: The lower bound
+        M: The upper bound
+
+    Returns:
+        x: scalar, bound between min (m) and Max (M)
+    """
+    if M is None:
+        M = m[1]
+        m = m[0]
+    # bound x between min (m) and Max (M)
+    return min(max(x, m), M)
+
+
+def rk4(derivs, y0, t):
+    """
+    Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
+
+    Example for 2D system:
+
+        >>> def derivs(x):
+        ...     d1 =  x[0] + 2*x[1]
+        ...     d2 =  -3*x[0] + 4*x[1]
+        ...     return d1, d2
+
+        >>> dt = 0.0005
+        >>> t = np.arange(0.0, 2.0, dt)
+        >>> y0 = (1,2)
+        >>> yout = rk4(derivs, y0, t)
+
+    Args:
+        derivs: the derivative of the system and has the signature ``dy = derivs(yi)``
+        y0: initial state vector
+        t: sample times
+
+    Returns:
+        yout: Runge-Kutta approximation of the ODE
+    """
+
+    try:
+        Ny = len(y0)
+    except TypeError:
+        yout = np.zeros((len(t),), np.float_)
+    else:
+        yout = np.zeros((len(t), Ny), np.float_)
+
+    yout[0] = y0
+
+    for i in np.arange(len(t) - 1):
+
+        this = t[i]
+        dt = t[i + 1] - this
+        dt2 = dt / 2.0
+        y0 = yout[i]
+
+        k1 = np.asarray(derivs(y0))
+        k2 = np.asarray(derivs(y0 + dt2 * k1))
+        k3 = np.asarray(derivs(y0 + dt2 * k2))
+        k4 = np.asarray(derivs(y0 + dt * k3))
+        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
+    # We only care about the final timestep and we cleave off action value which will be zero
+    return yout[-1][:4]
\ No newline at end of file
diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
index 9f472dc..44d6ca0 100644
--- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
+++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
@@ -123,9 +123,9 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         self.steps_beyond_terminated = None
 
     def step(self, action):
-        err_msg = f"{action!r} ({type(action)}) invalid"
-        assert self.action_space.contains(action), err_msg
-        assert self.state is not None, "Call reset before using step method."
+        # err_msg = f"{action!r} ({type(action)}) invalid"
+        # assert self.action_space.contains(action), err_msg
+        # assert self.state is not None, "Call reset before using step method."
         x, x_dot, theta, theta_dot = self.state
         # changed usage of action due to policy shaping approach
         force = action * self.force_mag
@@ -153,7 +153,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
             theta_dot = theta_dot + self.tau * thetaacc
             theta = theta + self.tau * theta_dot
 
-        self.state = (x, x_dot[0], theta, theta_dot[0])
+        self.state = (x, x_dot, theta, theta_dot)
 
         terminated = bool(
             x < -self.x_threshold
@@ -181,7 +181,6 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
 
         if self.render_mode == "human":
             self.render()
-
         return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
 
     def reset(
diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py
index 758e676..79b7665 100644
--- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py
+++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/Pendulum.py
@@ -126,7 +126,7 @@ class PendulumEnv(gym.Env):
 
         u = 2 * u # scaling the action to +/- 2 Nm
 
-        u = np.clip(u, -self.max_torque, self.max_torque)[0]
+        u = np.clip(u, -self.max_torque, self.max_torque)
         self.last_u = u  # for rendering
         costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2)
 
diff --git a/src/active_bo_ros/active_bo_ros/active_bo_topic.py b/src/active_bo_ros/active_bo_ros/active_bo_topic.py
index 161033a..5dc1a2f 100644
--- a/src/active_bo_ros/active_bo_ros/active_bo_topic.py
+++ b/src/active_bo_ros/active_bo_ros/active_bo_topic.py
@@ -9,7 +9,11 @@ from rclpy.node import Node
 from rclpy.callback_groups import ReentrantCallbackGroup
 
 from active_bo_ros.BayesianOptimization.BayesianOptimization import BayesianOptimization
+
 from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
+from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
+from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
+from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv
 
 import numpy as np
 import time
@@ -34,12 +38,13 @@ class ActiveBOTopic(Node):
                                                       1, callback_group=bo_callback_group)
 
         self.active_bo_pending = False
+        self.bo_env = None
         self.bo_nr_weights = None
         self.bo_steps = None
         self.bo_episodes = None
         self.bo_runs = None
         self.bo_acq_fcn = None
-        self.bo_epsilon = None
+        self.bo_metric_parameter = None
         self.current_run = 0
         self.current_episode = 0
 
@@ -58,7 +63,7 @@ class ActiveBOTopic(Node):
         self.rl_reward = None
 
         # RL Environments and BO
-        self.env = Continuous_MountainCarEnv()
+        self.env = None
         self.distance_penalty = 0
 
         self.BO = None
@@ -75,12 +80,13 @@ class ActiveBOTopic(Node):
                                           callback_group=mainloop_callback_group)
 
     def reset_bo_request(self):
+        self.bo_env = None
         self.bo_nr_weights = None
         self.bo_steps = None
         self.bo_episodes = None
         self.bo_runs = None
         self.bo_acq_fcn = None
-        self.bo_epsilon = None
+        self.bo_metric_parameter = None
         self.current_run = 0
         self.current_episode = 0
 
@@ -88,12 +94,13 @@ class ActiveBOTopic(Node):
         if not self.active_bo_pending:
             self.get_logger().info('Active Bayesian Optimization request pending!')
             self.active_bo_pending = True
+            self.bo_env = msg.env
             self.bo_nr_weights = msg.nr_weights
             self.bo_steps = msg.max_steps
             self.bo_episodes = msg.nr_episodes
             self.bo_runs = msg.nr_runs
             self.bo_acq_fcn = msg.acquisition_function
-            self.bo_epsilon = msg.epsilon
+            self.bo_metric_parameter = msg.metric_parameter
 
             # initialize
             self.reward = np.zeros((self.bo_episodes, self.bo_runs))
@@ -116,6 +123,18 @@ class ActiveBOTopic(Node):
 
     def mainloop_callback(self):
         if self.active_bo_pending:
+            # set rl environment
+            if self.bo_env == "Mountain Car":
+                self.env = Continuous_MountainCarEnv()
+            elif self.bo_env == "Cartpole":
+                self.env = CartPoleEnv()
+            elif self.bo_env == "Acrobot":
+                self.env = AcrobotEnv()
+            elif self.bo_env == "Pendulum":
+                self.env = PendulumEnv()
+            else:
+                raise NotImplementedError
+
             if self.BO is None:
                 self.BO = BayesianOptimization(self.env,
                                                self.bo_steps,
@@ -139,6 +158,7 @@ class ActiveBOTopic(Node):
                 self.active_bo_pub.publish(bo_response)
                 self.reset_bo_request()
                 self.active_bo_pending = False
+                self.BO = None
 
             else:
                 if self.active_rl_pending:
@@ -151,10 +171,11 @@ class ActiveBOTopic(Node):
                         self.get_logger().error(f'Active Reinforcement Learning failed to add new observation: {e}')
                 else:
                     if self.current_episode < self.bo_episodes:
-                        if np.random.uniform(0.0, 1.0, 1) < self.bo_epsilon:
+                        if np.random.uniform(0.0, 1.0, 1) < self.bo_metric_parameter:
                             active_rl_request = ActiveRL()
                             old_policy, _, old_weights = self.BO.get_best_result()
 
+                            active_rl_request.env = self.bo_env
                             active_rl_request.policy = old_policy.tolist()
                             active_rl_request.weights = old_weights.tolist()
 
diff --git a/src/active_bo_ros/active_bo_ros/active_rl_topic.py b/src/active_bo_ros/active_bo_ros/active_rl_topic.py
index 535ce65..582d7a6 100644
--- a/src/active_bo_ros/active_bo_ros/active_rl_topic.py
+++ b/src/active_bo_ros/active_bo_ros/active_rl_topic.py
@@ -9,6 +9,10 @@ from rclpy.node import Node
 from rclpy.callback_groups import ReentrantCallbackGroup
 
 from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
+from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
+from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
+from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv
+
 
 import numpy as np
 import time
@@ -32,6 +36,7 @@ class ActiveRLService(Node):
                                                       1, callback_group=rl_callback_group)
 
         self.active_rl_pending = False
+        self.rl_env = None
         self.rl_policy = None
         self.rl_weights = None
         self.rl_reward = 0.0
@@ -58,7 +63,8 @@ class ActiveRLService(Node):
         self.eval_weights = None
 
         # RL Environments
-        self.env = Continuous_MountainCarEnv(render_mode='rgb_array')
+        self.env = None
+
         self.distance_penalty = 0
         self.best_pol_shown = False
 
@@ -69,13 +75,26 @@ class ActiveRLService(Node):
                                           callback_group=mainloop_callback_group)
 
     def reset_rl_request(self):
+        self.rl_env = None
         self.rl_policy = None
         self.rl_weights = None
 
     def active_rl_callback(self, msg):
-        self.rl_policy = msg.policy
+        self.rl_env = msg.env
+        self.rl_policy = np.array(msg.policy, dtype=np.float32)
         self.rl_weights = msg.weights
 
+        if self.rl_env == "Mountain Car":
+            self.env = Continuous_MountainCarEnv(render_mode="rgb_array")
+        elif self.rl_env == "Cartpole":
+            self.env = CartPoleEnv(render_mode="rgb_array")
+        elif self.rl_env == "Acrobot":
+            self.env = AcrobotEnv(render_mode="rgb_array")
+        elif self.rl_env == "Pendulum":
+            self.env = PendulumEnv(render_mode="rgb_array")
+        else:
+            raise NotImplementedError
+
         self.get_logger().info('Active RL: Called!')
         self.env.reset()
         self.active_rl_pending = True
@@ -85,15 +104,20 @@ class ActiveRLService(Node):
         self.eval_weights = None
 
     def active_rl_eval_callback(self, msg):
-        self.eval_policy = msg.policy
+        self.eval_policy = np.array(msg.policy, dtype=np.float32)
         self.eval_weights = msg.weights
 
         self.get_logger().info('Active RL Eval: Responded!')
+        self.env.reset()
         self.eval_response_received = True
 
     def next_image(self, policy):
         action = policy[self.rl_step]
-        output = self.env.step(action)
+        action_clipped = action.clip(min=-1.0, max=1.0)
+        self.get_logger().info(str(action_clipped) + str(type(action_clipped)))
+        output = self.env.step(action_clipped.astype(np.float32))
+
+        self.get_logger().info(str(output))
 
         self.rl_reward += output[1]
         done = output[2]
@@ -133,7 +157,7 @@ class ActiveRLService(Node):
                     self.rl_reward = 0.0
 
                     eval_request = ActiveRL()
-                    eval_request.policy = self.rl_policy
+                    eval_request.policy = self.rl_policy.tolist()
                     eval_request.weights = self.rl_weights
 
                     self.eval_pub.publish(eval_request)
diff --git a/src/active_bo_ros/active_bo_ros/rl_service.py b/src/active_bo_ros/active_bo_ros/rl_service.py
index 67f26f6..e259384 100644
--- a/src/active_bo_ros/active_bo_ros/rl_service.py
+++ b/src/active_bo_ros/active_bo_ros/rl_service.py
@@ -5,6 +5,9 @@ import rclpy
 from rclpy.node import Node
 
 from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
+from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
+from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
+from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv
 
 import numpy as np
 
@@ -16,7 +19,7 @@ class RLService(Node):
 
         self.publisher = self.create_publisher(ImageFeedback, 'rl_feedback', 1)
 
-        self.env = Continuous_MountainCarEnv(render_mode='rgb_array')
+        self.env = None
         self.distance_penalty = 0
 
     def rl_callback(self, request, response):
@@ -25,13 +28,27 @@ class RLService(Node):
 
         reward = 0
         step_count = 0
-        policy = request.policy
+        policy = np.array(request.policy, dtype=np.float32)
+        rl_env = request.env
+
+        if rl_env == "Mountain Car":
+            self.env = Continuous_MountainCarEnv(render_mode="rgb_array")
+        elif rl_env == "Cartpole":
+            self.env = CartPoleEnv(render_mode="rgb_array")
+        elif rl_env == "Acrobot":
+            self.env = AcrobotEnv(render_mode="rgb_array")
+        elif rl_env == "Pendulum":
+            self.env = PendulumEnv(render_mode="rgb_array")
+        else:
+            raise NotImplementedError
 
         self.env.reset()
 
         for i in range(len(policy)):
             action = policy[i]
-            output = self.env.step(action)
+            action_clipped = action.clip(min=-1.0, max=1.0)
+            self.get_logger().info(str(action_clipped) + str(type(action_clipped)))
+            output = self.env.step(action_clipped.astype(np.float32))
 
             reward += output[1]
             done = output[2]
@@ -64,6 +81,7 @@ class RLService(Node):
 
         return response
 
+
 def main(args=None):
     rclpy.init(args=args)
 
@@ -73,5 +91,6 @@ def main(args=None):
 
     rclpy.shutdown()
 
+
 if __name__ == '__main__':
     main()