Integrate all environments except Acrobot into the interface and adapt the msgs and srvs accordingly

parent a101b2f53b
commit 7ea78d5e0f
@@ -1,6 +1,8 @@
+string env
+string metric
 uint16 nr_weights
 uint16 max_steps
 uint16 nr_episodes
 uint16 nr_runs
 string acquisition_function
-float32 epsilon
+float32 metric_parameter
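The hunk above adds an `env` selector and a `metric` name to the BO request message and renames `epsilon` to the more general `metric_parameter`. As a hedged illustration (the message type and package below are assumptions; only the field names come from the definition above), a client node might fill the updated message like this:

```
# Hypothetical sketch only: package/type names are assumed, field names are
# taken from the message definition above.
from active_bo_msgs.msg import ActiveBORequest  # assumed import path

msg = ActiveBORequest()
msg.env = "Cartpole"                    # new: which RL environment to run
msg.metric = "regret"                   # new: active-learning metric by name
msg.nr_weights = 6
msg.max_steps = 200
msg.nr_episodes = 50
msg.nr_runs = 10
msg.acquisition_function = "Expected Improvement"
msg.metric_parameter = 0.1              # was `epsilon` before this commit
```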
@@ -1,2 +1,3 @@
+string env
 float32[] policy
 float32[] weights
@@ -1,3 +1,4 @@
+string env
 float32[] policy
 ---
 float32 reward
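The service request now carries the environment name alongside the policy and still returns a single scalar reward. A minimal client sketch, assuming a service type named `RLRollOut` served under `rl_service` (both names are assumptions; the request/response fields mirror the definition above):

```
# Hypothetical sketch: service type and service name are assumptions.
import rclpy
from rclpy.node import Node
from active_bo_msgs.srv import RLRollOut  # assumed import path

rclpy.init()
node = Node("rl_rollout_client")
client = node.create_client(RLRollOut, "rl_service")  # assumed service name
client.wait_for_service()

request = RLRollOut.Request()
request.env = "Pendulum"                 # new field added by this commit
request.policy = [0.0, 0.5, -0.5, 0.2]   # one action per environment step

future = client.call_async(request)
rclpy.spin_until_future_complete(node, future)
node.get_logger().info(f"episode reward: {future.result().reward}")
```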
@@ -47,9 +47,12 @@ class BayesianOptimization:
         env_reward = 0.0
         step_count = 0

+        self.env.reset()
+
         for i in range(len(policy)):
             action = policy[i]
-            output = self.env.step(action)
+            action_clipped = action.clip(min=-1.0, max=1.0)
+            output = self.env.step(action_clipped.astype(np.float32))

             env_reward += output[1]
             done = output[2]
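The rollout now clips each policy value into the `[-1, 1]` action range and casts it to `float32` before stepping the environment. A minimal sketch of that transformation, assuming the policy entries are NumPy scalars or arrays:

```
import numpy as np

# Raw policy output may leave the Box action bounds of the continuous envs.
action = np.float64(1.7)

action_clipped = action.clip(min=-1.0, max=1.0)  # -> 1.0, inside the bounds
step_input = action_clipped.astype(np.float32)   # dtype the envs expect

assert step_input.dtype == np.float32
assert -1.0 <= float(step_input) <= 1.0
```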
@@ -0,0 +1,465 @@
+"""classic Acrobot task"""
+from typing import Optional
+
+import numpy as np
+from numpy import cos, pi, sin
+
+from gym import core, logger, spaces
+from gym.error import DependencyNotInstalled
+
+__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
+__credits__ = [
+    "Alborz Geramifard",
+    "Robert H. Klein",
+    "Christoph Dann",
+    "William Dabney",
+    "Jonathan P. How",
+]
+__license__ = "BSD 3-Clause"
+__author__ = "Christoph Dann <cdann@cdann.de>"
+
+# SOURCE:
+# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py
+from gym.envs.classic_control import utils
+
+
+class AcrobotEnv(core.Env):
+    """
+    ### Description
+
+    The Acrobot environment is based on Sutton's work in
+    ["Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding"](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html)
+    and [Sutton and Barto's book](http://www.incompleteideas.net/book/the-book-2nd.html).
+    The system consists of two links connected linearly to form a chain, with one end of
+    the chain fixed. The joint between the two links is actuated. The goal is to apply
+    torques on the actuated joint to swing the free end of the linear chain above a
+    given height while starting from the initial state of hanging downwards.
+
+    As seen in the **Gif**: two blue links connected by two green joints. The joint in
+    between the two links is actuated. The goal is to swing the free end of the outer-link
+    to reach the target height (black horizontal line above system) by applying torque on
+    the actuator.
+
+    ### Action Space
+
+    The action is discrete, deterministic, and represents the torque applied on the actuated
+    joint between the two links.
+
+    | Num | Action                                | Unit         |
+    |-----|---------------------------------------|--------------|
+    | 0   | apply -1 torque to the actuated joint | torque (N m) |
+    | 1   | apply 0 torque to the actuated joint  | torque (N m) |
+    | 2   | apply 1 torque to the actuated joint  | torque (N m) |
+
+    ### Observation Space
+
+    The observation is a `ndarray` with shape `(6,)` that provides information about the
+    two rotational joint angles as well as their angular velocities:
+
+    | Num | Observation                  | Min                 | Max               |
+    |-----|------------------------------|---------------------|-------------------|
+    | 0   | Cosine of `theta1`           | -1                  | 1                 |
+    | 1   | Sine of `theta1`             | -1                  | 1                 |
+    | 2   | Cosine of `theta2`           | -1                  | 1                 |
+    | 3   | Sine of `theta2`             | -1                  | 1                 |
+    | 4   | Angular velocity of `theta1` | ~ -12.567 (-4 * pi) | ~ 12.567 (4 * pi) |
+    | 5   | Angular velocity of `theta2` | ~ -28.274 (-9 * pi) | ~ 28.274 (9 * pi) |
+
+    where
+    - `theta1` is the angle of the first joint, where an angle of 0 indicates the first link is pointing directly
+    downwards.
+    - `theta2` is ***relative to the angle of the first link.***
+        An angle of 0 corresponds to having the same angle between the two links.
+
+    The angular velocities of `theta1` and `theta2` are bounded at ±4π, and ±9π rad/s respectively.
+    A state of `[1, 0, 1, 0, ..., ...]` indicates that both links are pointing downwards.
+
+    ### Rewards
+
+    The goal is to have the free end reach a designated target height in as few steps as possible,
+    and as such all steps that do not reach the goal incur a reward of -1.
+    Achieving the target height results in termination with a reward of 0. The reward threshold is -100.
+
+    ### Starting State
+
+    Each parameter in the underlying state (`theta1`, `theta2`, and the two angular velocities) is initialized
+    uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity.
+
+    ### Episode End
+
+    The episode ends if one of the following occurs:
+    1. Termination: The free end reaches the target height, which is constructed as:
+    `-cos(theta1) - cos(theta2 + theta1) > 1.0`
+    2. Truncation: Episode length is greater than 500 (200 for v0)
+
+    ### Arguments
+
+    No additional arguments are currently supported.
+
+    ```
+    env = gym.make('Acrobot-v1')
+    ```
+
+    By default, the dynamics of the acrobot follow those described in Sutton and Barto's book
+    [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/11/node4.html).
+    However, a `book_or_nips` parameter can be modified to change the pendulum dynamics to those described
+    in the original [NeurIPS paper](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html).
+
+    ```
+    # To change the dynamics as described above
+    env.env.book_or_nips = 'nips'
+    ```
+
+    See the following note and
+    the [implementation](https://github.com/openai/gym/blob/master/gym/envs/classic_control/acrobot.py) for details:
+
+    > The dynamics equations were missing some terms in the NIPS paper which
+    are present in the book. R. Sutton confirmed in personal correspondence
+    that the experimental results shown in the paper and the book were
+    generated with the equations shown in the book.
+    However, there is the option to run the domain with the paper equations
+    by setting `book_or_nips = 'nips'`
+
+
+    ### Version History
+
+    - v1: Maximum number of steps increased from 200 to 500. The observation space for v0 provided direct readings of
+    `theta1` and `theta2` in radians, having a range of `[-pi, pi]`. The v1 observation space as described here provides the
+    sine and cosine of each angle instead.
+    - v0: Initial versions release (1.0.0) (removed from gym for v1)
+
+    ### References
+    - Sutton, R. S. (1996). Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding.
+    In D. Touretzky, M. C. Mozer, & M. Hasselmo (Eds.), Advances in Neural Information Processing Systems (Vol. 8).
+    MIT Press. https://proceedings.neurips.cc/paper/1995/file/8f1d43620bc6bb580df6e80b0dc05c48-Paper.pdf
+    - Sutton, R. S., Barto, A. G. (2018). Reinforcement Learning: An Introduction. The MIT Press.
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 15,
+    }
+
+    dt = 0.2
+
+    LINK_LENGTH_1 = 1.0  # [m]
+    LINK_LENGTH_2 = 1.0  # [m]
+    LINK_MASS_1 = 1.0  #: [kg] mass of link 1
+    LINK_MASS_2 = 1.0  #: [kg] mass of link 2
+    LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
+    LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
+    LINK_MOI = 1.0  #: moments of inertia for both links
+
+    MAX_VEL_1 = 4 * pi
+    MAX_VEL_2 = 9 * pi
+
+    AVAIL_TORQUE = [-1.0, 0.0, +1]
+
+    torque_noise_max = 0.0
+
+    SCREEN_DIM = 500
+
+    #: use dynamics equations from the nips paper or the book
+    book_or_nips = "book"
+    action_arrow = None
+    domain_fig = None
+    actions_num = 3
+
+    def __init__(self, render_mode: Optional[str] = None):
+        self.render_mode = render_mode
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+        high = np.array(
+            [1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32
+        )
+        low = -high
+        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
+        self.action_space = spaces.Discrete(3)
+        self.state = None
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(
+            options, -0.1, 0.1  # default low
+        )  # default high
+        self.state = self.np_random.uniform(low=low, high=high, size=(4,)).astype(
+            np.float32
+        )
+
+        if self.render_mode == "human":
+            self.render()
+        return self._get_ob(), {}
+
+    def step(self, a):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        torque = self.AVAIL_TORQUE[a]
+
+        # Add noise to the force action
+        if self.torque_noise_max > 0:
+            torque += self.np_random.uniform(
+                -self.torque_noise_max, self.torque_noise_max
+            )
+
+        # Now, augment the state with our force action so it can be passed to
+        # _dsdt
+        s_augmented = np.append(s, torque)
+
+        ns = rk4(self._dsdt, s_augmented, [0, self.dt])
+
+        ns[0] = wrap(ns[0], -pi, pi)
+        ns[1] = wrap(ns[1], -pi, pi)
+        ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
+        ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
+        self.state = ns
+        terminated = self._terminal()
+        reward = -1.0 if not terminated else 0.0
+
+        if self.render_mode == "human":
+            self.render()
+        return (self._get_ob(), reward, terminated, False, {})
+
+    def _get_ob(self):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        return np.array(
+            [cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]], dtype=np.float32
+        )
+
+    def _terminal(self):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0)
+
+    def _dsdt(self, s_augmented):
+        m1 = self.LINK_MASS_1
+        m2 = self.LINK_MASS_2
+        l1 = self.LINK_LENGTH_1
+        lc1 = self.LINK_COM_POS_1
+        lc2 = self.LINK_COM_POS_2
+        I1 = self.LINK_MOI
+        I2 = self.LINK_MOI
+        g = 9.8
+        a = s_augmented[-1]
+        s = s_augmented[:-1]
+        theta1 = s[0]
+        theta2 = s[1]
+        dtheta1 = s[2]
+        dtheta2 = s[3]
+        d1 = (
+            m1 * lc1**2
+            + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2))
+            + I1
+            + I2
+        )
+        d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
+        phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0)
+        phi1 = (
+            -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
+            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
+            + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2)
+            + phi2
+        )
+        if self.book_or_nips == "nips":
+            # the following line is consistent with the description in the
+            # paper
+            ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2**2 + I2 - d2**2 / d1)
+        else:
+            # the following line is consistent with the java implementation and the
+            # book
+            ddtheta2 = (
+                a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
+            ) / (m2 * lc2**2 + I2 - d2**2 / d1)
+        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
+        return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
+
+    def render(self):
+        if self.render_mode is None:
+            logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gym[classic_control]`"
+            )
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.SCREEN_DIM, self.SCREEN_DIM)
+                )
+            else:  # mode in "rgb_array"
+                self.screen = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        surf = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM))
+        surf.fill((255, 255, 255))
+        s = self.state
+
+        bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2  # 2.2 for default
+        scale = self.SCREEN_DIM / (bound * 2)
+        offset = self.SCREEN_DIM / 2
+
+        if s is None:
+            return None
+
+        p1 = [
+            -self.LINK_LENGTH_1 * cos(s[0]) * scale,
+            self.LINK_LENGTH_1 * sin(s[0]) * scale,
+        ]
+
+        p2 = [
+            p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]) * scale,
+            p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1]) * scale,
+        ]
+
+        xys = np.array([[0, 0], p1, p2])[:, ::-1]
+        thetas = [s[0] - pi / 2, s[0] + s[1] - pi / 2]
+        link_lengths = [self.LINK_LENGTH_1 * scale, self.LINK_LENGTH_2 * scale]
+
+        pygame.draw.line(
+            surf,
+            start_pos=(-2.2 * scale + offset, 1 * scale + offset),
+            end_pos=(2.2 * scale + offset, 1 * scale + offset),
+            color=(0, 0, 0),
+        )
+
+        for ((x, y), th, llen) in zip(xys, thetas, link_lengths):
+            x = x + offset
+            y = y + offset
+            l, r, t, b = 0, llen, 0.1 * scale, -0.1 * scale
+            coords = [(l, b), (l, t), (r, t), (r, b)]
+            transformed_coords = []
+            for coord in coords:
+                coord = pygame.math.Vector2(coord).rotate_rad(th)
+                coord = (coord[0] + x, coord[1] + y)
+                transformed_coords.append(coord)
+            gfxdraw.aapolygon(surf, transformed_coords, (0, 204, 204))
+            gfxdraw.filled_polygon(surf, transformed_coords, (0, 204, 204))
+
+            gfxdraw.aacircle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0))
+            gfxdraw.filled_circle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0))
+
+        surf = pygame.transform.flip(surf, False, True)
+        self.screen.blit(surf, (0, 0))
+
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
+
+
+def wrap(x, m, M):
+    """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
+    truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n
+    For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
+
+    Args:
+        x: a scalar
+        m: minimum possible value in range
+        M: maximum possible value in range
+
+    Returns:
+        x: a scalar, wrapped
+    """
+    diff = M - m
+    while x > M:
+        x = x - diff
+    while x < m:
+        x = x + diff
+    return x
+
+
+def bound(x, m, M=None):
+    """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
+    have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
+
+    Args:
+        x: scalar
+        m: The lower bound
+        M: The upper bound
+
+    Returns:
+        x: scalar, bound between min (m) and Max (M)
+    """
+    if M is None:
+        M = m[1]
+        m = m[0]
+    # bound x between min (m) and Max (M)
+    return min(max(x, m), M)
+
+
+def rk4(derivs, y0, t):
+    """
+    Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
+
+    Example for 2D system:
+
+        >>> def derivs(x):
+        ...     d1 = x[0] + 2*x[1]
+        ...     d2 = -3*x[0] + 4*x[1]
+        ...     return d1, d2
+
+        >>> dt = 0.0005
+        >>> t = np.arange(0.0, 2.0, dt)
+        >>> y0 = (1,2)
+        >>> yout = rk4(derivs, y0, t)
+
+    Args:
+        derivs: the derivative of the system and has the signature ``dy = derivs(yi)``
+        y0: initial state vector
+        t: sample times
+
+    Returns:
+        yout: Runge-Kutta approximation of the ODE
+    """
+
+    try:
+        Ny = len(y0)
+    except TypeError:
+        yout = np.zeros((len(t),), np.float_)
+    else:
+        yout = np.zeros((len(t), Ny), np.float_)
+
+    yout[0] = y0
+
+    for i in np.arange(len(t) - 1):
+
+        this = t[i]
+        dt = t[i + 1] - this
+        dt2 = dt / 2.0
+        y0 = yout[i]
+
+        k1 = np.asarray(derivs(y0))
+        k2 = np.asarray(derivs(y0 + dt2 * k1))
+        k3 = np.asarray(derivs(y0 + dt2 * k2))
+        k4 = np.asarray(derivs(y0 + dt * k3))
+        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
+    # We only care about the final timestep and we cleave off action value which will be zero
+    return yout[-1][:4]
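The new module above is a copy of Gym's classic-control Acrobot environment, kept on the 5-tuple `step` API that the rest of this commit relies on (`output[1]` is the reward, `output[2]` the termination flag). A short sketch of a random rollout against this copy, assuming only the module path already used elsewhere in the commit:

```
import numpy as np
from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv

env = AcrobotEnv(render_mode="rgb_array")
obs, info = env.reset(seed=0)

total_reward = 0.0
for _ in range(200):
    action = int(np.random.randint(0, 3))  # discrete torques {-1, 0, +1}
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward                 # -1 per step until the goal height
    if terminated:
        break

frame = env.render()                       # rgb_array mode returns an HxWx3 image
env.close()
```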
@@ -123,9 +123,9 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         self.steps_beyond_terminated = None

     def step(self, action):
-        err_msg = f"{action!r} ({type(action)}) invalid"
-        assert self.action_space.contains(action), err_msg
-        assert self.state is not None, "Call reset before using step method."
+        # err_msg = f"{action!r} ({type(action)}) invalid"
+        # assert self.action_space.contains(action), err_msg
+        # assert self.state is not None, "Call reset before using step method."
         x, x_dot, theta, theta_dot = self.state
         # changed usage of action due to policy shaping approach
         force = action * self.force_mag
@@ -153,7 +153,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         theta_dot = theta_dot + self.tau * thetaacc
         theta = theta + self.tau * theta_dot

-        self.state = (x, x_dot[0], theta, theta_dot[0])
+        self.state = (x, x_dot, theta, theta_dot)

         terminated = bool(
             x < -self.x_threshold
@@ -181,7 +181,6 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):

         if self.render_mode == "human":
             self.render()
-
         return np.array(self.state, dtype=np.float32), reward, terminated, False, {}

     def reset(
@@ -126,7 +126,7 @@ class PendulumEnv(gym.Env):

         u = 2 * u  # scaling the action to +/- 2 Nm

-        u = np.clip(u, -self.max_torque, self.max_torque)[0]
+        u = np.clip(u, -self.max_torque, self.max_torque)
        self.last_u = u  # for rendering
        costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2)

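Dropping the `[0]` index appears to match the new calling convention: after the clipping change earlier in this commit, the action handed to `step` is a NumPy scalar rather than a length-1 array, and indexing a NumPy scalar fails. A small illustration of the difference (names and values are illustrative only):

```
import numpy as np

max_torque = 2.0

u_array = np.clip(np.array([2.5], dtype=np.float32), -max_torque, max_torque)
print(u_array[0])       # old style: a length-1 array, so [0] works

u_scalar = np.clip(np.float32(2.5), -max_torque, max_torque)
print(float(u_scalar))  # new style: already a scalar; u_scalar[0] would raise an IndexError
```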
@@ -9,7 +9,11 @@ from rclpy.node import Node
 from rclpy.callback_groups import ReentrantCallbackGroup

 from active_bo_ros.BayesianOptimization.BayesianOptimization import BayesianOptimization

 from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
+from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
+from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
+from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv
+
 import numpy as np
 import time
@@ -34,12 +38,13 @@ class ActiveBOTopic(Node):
                                                      1, callback_group=bo_callback_group)

         self.active_bo_pending = False
+        self.bo_env = None
         self.bo_nr_weights = None
         self.bo_steps = None
         self.bo_episodes = None
         self.bo_runs = None
         self.bo_acq_fcn = None
-        self.bo_epsilon = None
+        self.bo_metric_parameter = None
         self.current_run = 0
         self.current_episode = 0

@@ -58,7 +63,7 @@ class ActiveBOTopic(Node):
         self.rl_reward = None

         # RL Environments and BO
-        self.env = Continuous_MountainCarEnv()
+        self.env = None
         self.distance_penalty = 0

         self.BO = None
@@ -75,12 +80,13 @@ class ActiveBOTopic(Node):
                                                      callback_group=mainloop_callback_group)

     def reset_bo_request(self):
+        self.bo_env = None
         self.bo_nr_weights = None
         self.bo_steps = None
         self.bo_episodes = None
         self.bo_runs = None
         self.bo_acq_fcn = None
-        self.bo_epsilon = None
+        self.bo_metric_parameter = None
         self.current_run = 0
         self.current_episode = 0

@@ -88,12 +94,13 @@ class ActiveBOTopic(Node):
         if not self.active_bo_pending:
             self.get_logger().info('Active Bayesian Optimization request pending!')
             self.active_bo_pending = True
+            self.bo_env = msg.env
             self.bo_nr_weights = msg.nr_weights
             self.bo_steps = msg.max_steps
             self.bo_episodes = msg.nr_episodes
             self.bo_runs = msg.nr_runs
             self.bo_acq_fcn = msg.acquisition_function
-            self.bo_epsilon = msg.epsilon
+            self.bo_metric_parameter = msg.metric_parameter

             # initialize
             self.reward = np.zeros((self.bo_episodes, self.bo_runs))
@@ -116,6 +123,18 @@ class ActiveBOTopic(Node):

     def mainloop_callback(self):
         if self.active_bo_pending:
+            # set rl environment
+            if self.bo_env == "Mountain Car":
+                self.env = Continuous_MountainCarEnv()
+            elif self.bo_env == "Cartpole":
+                self.env = CartPoleEnv()
+            elif self.bo_env == "Acrobot":
+                self.env = AcrobotEnv()
+            elif self.bo_env == "Pendulum":
+                self.env = PendulumEnv()
+            else:
+                raise NotImplementedError
+
             if self.BO is None:
                 self.BO = BayesianOptimization(self.env,
                                                self.bo_steps,
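The environment is now chosen from the `env` string carried in the request. The same if/elif chain appears in three nodes in this commit; as a sketch only (not code from this commit), the mapping could also live in one dictionary-based factory shared by all of them:

```
# Sketch of a possible shared helper; imports match the ones added above.
from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv

ENVIRONMENTS = {
    "Mountain Car": Continuous_MountainCarEnv,
    "Cartpole": CartPoleEnv,
    "Acrobot": AcrobotEnv,
    "Pendulum": PendulumEnv,
}


def make_env(name, render_mode=None):
    """Return a fresh environment for one of the names used in the messages."""
    try:
        env_cls = ENVIRONMENTS[name]
    except KeyError:
        raise NotImplementedError(f"Unknown environment: {name}")
    return env_cls(render_mode=render_mode) if render_mode else env_cls()

# e.g. self.env = make_env(self.bo_env) in the topic node,
#      self.env = make_env(rl_env, render_mode="rgb_array") in the services.
```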
@@ -139,6 +158,7 @@ class ActiveBOTopic(Node):
                 self.active_bo_pub.publish(bo_response)
                 self.reset_bo_request()
                 self.active_bo_pending = False
+                self.BO = None

             else:
                 if self.active_rl_pending:
@@ -151,10 +171,11 @@ class ActiveBOTopic(Node):
                        self.get_logger().error(f'Active Reinforcement Learning failed to add new observation: {e}')
                    else:
                        if self.current_episode < self.bo_episodes:
-                            if np.random.uniform(0.0, 1.0, 1) < self.bo_epsilon:
+                            if np.random.uniform(0.0, 1.0, 1) < self.bo_metric_parameter:
                                active_rl_request = ActiveRL()
                                old_policy, _, old_weights = self.BO.get_best_result()

+                                active_rl_request.env = self.bo_env
                                active_rl_request.policy = old_policy.tolist()
                                active_rl_request.weights = old_weights.tolist()

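In the branch above, the draw `np.random.uniform(0.0, 1.0, 1) < self.bo_metric_parameter` decides whether the current episode becomes a user query: an `ActiveRL` request carrying the best policy found so far is published with probability `bo_metric_parameter` (presumably why the old `epsilon` name was generalized alongside the new `metric` field). A tiny standalone sketch of that decision rule, with illustrative values only:

```
import numpy as np

rng = np.random.default_rng(seed=1)
metric_parameter = 0.2  # probability that an episode is turned into a user query

queries = sum(rng.uniform(0.0, 1.0) < metric_parameter for _ in range(10_000))
print(queries / 10_000)  # close to 0.2: roughly one episode in five asks the user
```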
@@ -9,6 +9,10 @@ from rclpy.node import Node
 from rclpy.callback_groups import ReentrantCallbackGroup

 from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
+from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
+from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
+from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv
+

 import numpy as np
 import time
@@ -32,6 +36,7 @@ class ActiveRLService(Node):
                                                      1, callback_group=rl_callback_group)

         self.active_rl_pending = False
+        self.rl_env = None
         self.rl_policy = None
         self.rl_weights = None
         self.rl_reward = 0.0
@@ -58,7 +63,8 @@ class ActiveRLService(Node):
         self.eval_weights = None

         # RL Environments
-        self.env = Continuous_MountainCarEnv(render_mode='rgb_array')
+        self.env = None
+
         self.distance_penalty = 0
         self.best_pol_shown = False

@@ -69,13 +75,26 @@ class ActiveRLService(Node):
                                                      callback_group=mainloop_callback_group)

     def reset_rl_request(self):
+        self.rl_env = None
         self.rl_policy = None
         self.rl_weights = None

     def active_rl_callback(self, msg):
-        self.rl_policy = msg.policy
+        self.rl_env = msg.env
+        self.rl_policy = np.array(msg.policy, dtype=np.float32)
         self.rl_weights = msg.weights

+        if self.rl_env == "Mountain Car":
+            self.env = Continuous_MountainCarEnv(render_mode="rgb_array")
+        elif self.rl_env == "Cartpole":
+            self.env = CartPoleEnv(render_mode="rgb_array")
+        elif self.rl_env == "Acrobot":
+            self.env = AcrobotEnv(render_mode="rgb_array")
+        elif self.rl_env == "Pendulum":
+            self.env = PendulumEnv(render_mode="rgb_array")
+        else:
+            raise NotImplementedError
+
         self.get_logger().info('Active RL: Called!')
         self.env.reset()
         self.active_rl_pending = True
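The callback now stores the environment name and converts the incoming `float32[]` policy into a NumPy array up front; the per-step clipping in `next_image` relies on NumPy methods (`clip`, `astype`) that the raw message field does not provide. A short sketch of that conversion, with the message object mocked for illustration:

```
import numpy as np

# Stand-in for the fields of an incoming ActiveRL message (illustrative only).
class FakeMsg:
    env = "Cartpole"
    policy = [0.3, 1.4, -2.0]   # float32[] arrives as a plain Python sequence
    weights = [0.0, 1.0, 0.5]

msg = FakeMsg()
rl_policy = np.array(msg.policy, dtype=np.float32)

action = rl_policy[0]                      # np.float32 scalar for one step
action_clipped = action.clip(min=-1.0, max=1.0)
print(action_clipped.astype(np.float32))   # ready for env.step(...)
```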
@@ -85,15 +104,20 @@ class ActiveRLService(Node):
         self.eval_weights = None

     def active_rl_eval_callback(self, msg):
-        self.eval_policy = msg.policy
+        self.eval_policy = np.array(msg.policy, dtype=np.float32)
         self.eval_weights = msg.weights

         self.get_logger().info('Active RL Eval: Responded!')
+        self.env.reset()
         self.eval_response_received = True

     def next_image(self, policy):
         action = policy[self.rl_step]
-        output = self.env.step(action)
+        action_clipped = action.clip(min=-1.0, max=1.0)
+        self.get_logger().info(str(action_clipped) + str(type(action_clipped)))
+        output = self.env.step(action_clipped.astype(np.float32))
+
+        self.get_logger().info(str(output))

         self.rl_reward += output[1]
         done = output[2]
@@ -133,7 +157,7 @@ class ActiveRLService(Node):
            self.rl_reward = 0.0

            eval_request = ActiveRL()
-            eval_request.policy = self.rl_policy
+            eval_request.policy = self.rl_policy.tolist()
            eval_request.weights = self.rl_weights

            self.eval_pub.publish(eval_request)
@@ -5,6 +5,9 @@ import rclpy
 from rclpy.node import Node

 from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
+from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
+from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
+from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv

 import numpy as np

@@ -16,7 +19,7 @@ class RLService(Node):

         self.publisher = self.create_publisher(ImageFeedback, 'rl_feedback', 1)

-        self.env = Continuous_MountainCarEnv(render_mode='rgb_array')
+        self.env = None
         self.distance_penalty = 0

     def rl_callback(self, request, response):
@@ -25,13 +28,27 @@ class RLService(Node):

         reward = 0
         step_count = 0
-        policy = request.policy
+        policy = np.array(request.policy, dtype=np.float32)
+        rl_env = request.env
+
+        if rl_env == "Mountain Car":
+            self.env = Continuous_MountainCarEnv(render_mode="rgb_array")
+        elif rl_env == "Cartpole":
+            self.env = CartPoleEnv(render_mode="rgb_array")
+        elif rl_env == "Acrobot":
+            self.env = AcrobotEnv(render_mode="rgb_array")
+        elif rl_env == "Pendulum":
+            self.env = PendulumEnv(render_mode="rgb_array")
+        else:
+            raise NotImplementedError

         self.env.reset()

         for i in range(len(policy)):
             action = policy[i]
-            output = self.env.step(action)
+            action_clipped = action.clip(min=-1.0, max=1.0)
+            self.get_logger().info(str(action_clipped) + str(type(action_clipped)))
+            output = self.env.step(action_clipped.astype(np.float32))

             reward += output[1]
             done = output[2]
@@ -64,6 +81,7 @@ class RLService(Node):

         return response

+
 def main(args=None):
     rclpy.init(args=args)

@@ -73,5 +91,6 @@ def main(args=None):

     rclpy.shutdown()

+
 if __name__ == '__main__':
     main()