From 1b9e099696d0d341b269d9e462c93efb5d8fec3d Mon Sep 17 00:00:00 2001
From: Niko
Date: Wed, 7 Jun 2023 11:59:17 +0200
Subject: [PATCH] pre major rl eval change

---
 src/active_bo_msgs/msg/ActiveBOResponse.msg   |  8 ++++----
 src/active_bo_msgs/msg/ActiveBOState.msg      |  4 ++--
 src/active_bo_msgs/msg/ActiveRL.msg           |  4 ++--
 src/active_bo_msgs/msg/ActiveRLResponse.msg   |  4 ++--
 .../BayesianOptimization/BayesianOpt.py       |  0
 .../BayesianOptimization.py                   | 20 +++++++++----------
 .../PolicyModel/GaussianRBFModel.py           |  2 +-
 .../ReinforcementLearning/CartPole.py         | 14 ++++++-------
 .../active_bo_ros/active_bo_topic.py          | 14 ++++++-------
 .../active_bo_ros/active_rl_topic.py          |  6 +++---
 src/active_bo_ros/active_bo_ros/rl_service.py |  4 ++--
 11 files changed, 40 insertions(+), 40 deletions(-)
 create mode 100644 src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOpt.py

diff --git a/src/active_bo_msgs/msg/ActiveBOResponse.msg b/src/active_bo_msgs/msg/ActiveBOResponse.msg
index 8cae313..e5684bd 100644
--- a/src/active_bo_msgs/msg/ActiveBOResponse.msg
+++ b/src/active_bo_msgs/msg/ActiveBOResponse.msg
@@ -1,4 +1,4 @@
-float32[] best_policy
-float32[] best_weights
-float32[] reward_mean
-float32[] reward_std
\ No newline at end of file
+float64[] best_policy
+float64[] best_weights
+float64[] reward_mean
+float64[] reward_std
\ No newline at end of file
diff --git a/src/active_bo_msgs/msg/ActiveBOState.msg b/src/active_bo_msgs/msg/ActiveBOState.msg
index c6df26e..4c4f321 100644
--- a/src/active_bo_msgs/msg/ActiveBOState.msg
+++ b/src/active_bo_msgs/msg/ActiveBOState.msg
@@ -1,4 +1,4 @@
 uint16 current_run
 uint16 current_episode
-float32 best_reward
-float32 last_user_reward
\ No newline at end of file
+float64 best_reward
+float64 last_user_reward
\ No newline at end of file
diff --git a/src/active_bo_msgs/msg/ActiveRL.msg b/src/active_bo_msgs/msg/ActiveRL.msg
index 11b20dd..f619bdb 100644
--- a/src/active_bo_msgs/msg/ActiveRL.msg
+++ b/src/active_bo_msgs/msg/ActiveRL.msg
@@ -1,5 +1,5 @@
 string env
 uint32 seed
 bool final_run
-float32[] policy
-float32[] weights
\ No newline at end of file
+float64[] policy
+float64[] weights
\ No newline at end of file
diff --git a/src/active_bo_msgs/msg/ActiveRLResponse.msg b/src/active_bo_msgs/msg/ActiveRLResponse.msg
index c81eda8..9565fc8 100644
--- a/src/active_bo_msgs/msg/ActiveRLResponse.msg
+++ b/src/active_bo_msgs/msg/ActiveRLResponse.msg
@@ -1,3 +1,3 @@
-float32[] weights
+float64[] weights
 uint16 final_step
-float32 reward
\ No newline at end of file
+float64 reward
\ No newline at end of file
diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOpt.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOpt.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
index 080940c..6a5c48e 100644
--- a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
+++ b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
@@ -51,7 +51,7 @@ class BayesianOptimization:
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))
 
             env_reward += output[1]
             done = output[2]
@@ -71,8 +71,8 @@ class BayesianOptimization:
         self.env.reset(seed=seed)
         self.reset_bo()
 
-        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float32)
-        self.Y = np.zeros((self.nr_init, 1), dtype=np.float32)
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float64)
+        self.Y = np.zeros((self.nr_init, 1), dtype=np.float64)
 
         for i in range(self.nr_init):
             self.policy_model.random_policy()
@@ -122,34 +122,34 @@ class BayesianOptimization:
         return x_next
 
     def eval_new_observation(self, x_next, seed=None):
-        self.policy_model.weights = x_next
+        self.policy_model.weights = np.around(x_next, decimals=8)
         policy = self.policy_model.rollout()
 
         reward, step_count = self.runner(policy, seed=seed)
 
-        self.X = np.vstack((self.X, x_next), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_next, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)
 
         self.GP.fit(self.X, self.Y)
 
         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)
 
         self.episode += 1
         return step_count
 
     def add_new_observation(self, reward, x_new):
-        self.X = np.vstack((self.X, x_new), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_new, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)
 
         self.GP.fit(self.X, self.Y)
 
         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)
 
         self.episode += 1
diff --git a/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py b/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py
index b517fb9..b0c2a8b 100644
--- a/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py
+++ b/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py
@@ -24,7 +24,7 @@ class GaussianRBF:
         self.policy = np.zeros((self.nr_steps, 1))
 
     def random_policy(self):
-        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)
+        self.weights = np.around(self.rng.uniform(self.low, self.upper, self.nr_weights), decimals=8)
 
     def rollout(self):
         self.policy = np.zeros((self.nr_steps, 1))
diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
index 5b3b743..b2c9660 100644
--- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
+++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
@@ -101,15 +101,15 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         high = np.array(
             [
                 self.x_threshold * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
                 self.theta_threshold_radians * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
             ],
-            dtype=np.float32,
+            dtype=np.float64,
         )
 
-        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
-        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
+        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
+        self.observation_space = spaces.Box(-high, high, dtype=np.float64)
 
         self.render_mode = render_mode
@@ -185,7 +185,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+        return np.array(self.state, dtype=np.float64), reward, terminated, False, {}
 
     def reset(
         self,
@@ -204,7 +204,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
 
         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), {}
+        return np.array(self.state, dtype=np.float64), {}
 
     def render(self):
         if self.render_mode is None:
diff --git a/src/active_bo_ros/active_bo_ros/active_bo_topic.py b/src/active_bo_ros/active_bo_ros/active_bo_topic.py
index 6bc3988..a08c46b 100644
--- a/src/active_bo_ros/active_bo_ros/active_bo_topic.py
+++ b/src/active_bo_ros/active_bo_ros/active_bo_topic.py
@@ -329,13 +329,13 @@ class ActiveBOTopic(Node):
         if self.BO is not None and self.BO.Y is not None:
             self.best_reward = np.max(self.BO.Y)
 
-            state_msg = ActiveBOState()
-            state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
-            state_msg.current_episode = self.current_episode + 1 \
-                if self.current_episode < self.bo_episodes else self.bo_episodes
-            state_msg.best_reward = float(self.best_reward)
-            state_msg.last_user_reward = float(self.rl_reward)
-            self.state_pub.publish(state_msg)
+        state_msg = ActiveBOState()
+        state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
+        state_msg.current_episode = self.current_episode + 1 \
+            if self.current_episode < self.bo_episodes else self.bo_episodes
+        state_msg.best_reward = float(self.best_reward)
+        state_msg.last_user_reward = float(self.rl_reward)
+        self.state_pub.publish(state_msg)
 
 
 def main(args=None):
diff --git a/src/active_bo_ros/active_bo_ros/active_rl_topic.py b/src/active_bo_ros/active_bo_ros/active_rl_topic.py
index 9a5d56a..32b39bb 100644
--- a/src/active_bo_ros/active_bo_ros/active_rl_topic.py
+++ b/src/active_bo_ros/active_bo_ros/active_rl_topic.py
@@ -86,7 +86,7 @@ class ActiveRLService(Node):
     def active_rl_callback(self, msg):
         self.rl_env = msg.env
         self.rl_seed = msg.seed
-        self.rl_policy = np.array(msg.policy, dtype=np.float32)
+        self.rl_policy = np.array(msg.policy, dtype=np.float64)
         self.rl_weights = msg.weights
         self.final_run = msg.final_run
 
@@ -112,7 +112,7 @@ class ActiveRLService(Node):
         self.eval_weights = None
 
     def active_rl_eval_callback(self, msg):
-        self.eval_policy = np.array(msg.policy, dtype=np.float32)
+        self.eval_policy = np.array(msg.policy, dtype=np.float64)
         self.eval_weights = msg.weights
 
         self.get_logger().info('Active RL Eval: Responded!')
@@ -122,7 +122,7 @@ class ActiveRLService(Node):
     def next_image(self, policy):
         action = policy[self.rl_step]
         action_clipped = action.clip(min=-1.0, max=1.0)
-        output = self.env.step(action_clipped.astype(np.float32))
+        output = self.env.step(action_clipped.astype(np.float64))
 
         self.rl_reward += output[1]
         done = output[2]
diff --git a/src/active_bo_ros/active_bo_ros/rl_service.py b/src/active_bo_ros/active_bo_ros/rl_service.py
index 2c70046..18aa9c2 100644
--- a/src/active_bo_ros/active_bo_ros/rl_service.py
+++ b/src/active_bo_ros/active_bo_ros/rl_service.py
@@ -27,7 +27,7 @@ class RLService(Node):
         reward = 0
         step_count = 0
 
-        policy = np.array(request.policy, dtype=np.float32)
+        policy = np.array(request.policy, dtype=np.float64)
         rl_env = request.env
 
         if rl_env == "Mountain Car":
@@ -46,7 +46,7 @@ class RLService(Node):
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))
 
             reward += output[1]
             done = output[2]