From 1b9e099696d0d341b269d9e462c93efb5d8fec3d Mon Sep 17 00:00:00 2001
From: Niko
Date: Wed, 7 Jun 2023 11:59:17 +0200
Subject: [PATCH] pre major rl eval change

---
 src/active_bo_msgs/msg/ActiveBOResponse.msg   |  8 ++++----
 src/active_bo_msgs/msg/ActiveBOState.msg      |  4 ++--
 src/active_bo_msgs/msg/ActiveRL.msg           |  4 ++--
 src/active_bo_msgs/msg/ActiveRLResponse.msg   |  4 ++--
 .../BayesianOptimization/BayesianOpt.py       |  0
 .../BayesianOptimization.py                   | 20 +++++++++----------
 .../PolicyModel/GaussianRBFModel.py           |  2 +-
 .../ReinforcementLearning/CartPole.py         | 14 ++++++-------
 .../active_bo_ros/active_bo_topic.py          | 14 ++++++-------
 .../active_bo_ros/active_rl_topic.py          |  6 +++---
 src/active_bo_ros/active_bo_ros/rl_service.py |  4 ++--
 11 files changed, 40 insertions(+), 40 deletions(-)
 create mode 100644 src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOpt.py

diff --git a/src/active_bo_msgs/msg/ActiveBOResponse.msg b/src/active_bo_msgs/msg/ActiveBOResponse.msg
index 8cae313..e5684bd 100644
--- a/src/active_bo_msgs/msg/ActiveBOResponse.msg
+++ b/src/active_bo_msgs/msg/ActiveBOResponse.msg
@@ -1,4 +1,4 @@
-float32[] best_policy
-float32[] best_weights
-float32[] reward_mean
-float32[] reward_std
\ No newline at end of file
+float64[] best_policy
+float64[] best_weights
+float64[] reward_mean
+float64[] reward_std
\ No newline at end of file
diff --git a/src/active_bo_msgs/msg/ActiveBOState.msg b/src/active_bo_msgs/msg/ActiveBOState.msg
index c6df26e..4c4f321 100644
--- a/src/active_bo_msgs/msg/ActiveBOState.msg
+++ b/src/active_bo_msgs/msg/ActiveBOState.msg
@@ -1,4 +1,4 @@
 uint16 current_run
 uint16 current_episode
-float32 best_reward
-float32 last_user_reward
\ No newline at end of file
+float64 best_reward
+float64 last_user_reward
\ No newline at end of file
diff --git a/src/active_bo_msgs/msg/ActiveRL.msg b/src/active_bo_msgs/msg/ActiveRL.msg
index 11b20dd..f619bdb 100644
--- a/src/active_bo_msgs/msg/ActiveRL.msg
+++ b/src/active_bo_msgs/msg/ActiveRL.msg
@@ -1,5 +1,5 @@
 string env
 uint32 seed
 bool final_run
-float32[] policy
-float32[] weights
\ No newline at end of file
+float64[] policy
+float64[] weights
\ No newline at end of file
diff --git a/src/active_bo_msgs/msg/ActiveRLResponse.msg b/src/active_bo_msgs/msg/ActiveRLResponse.msg
index c81eda8..9565fc8 100644
--- a/src/active_bo_msgs/msg/ActiveRLResponse.msg
+++ b/src/active_bo_msgs/msg/ActiveRLResponse.msg
@@ -1,3 +1,3 @@
-float32[] weights
+float64[] weights
 uint16 final_step
-float32 reward
\ No newline at end of file
+float64 reward
\ No newline at end of file
diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOpt.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOpt.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
index 080940c..6a5c48e 100644
--- a/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
+++ b/src/active_bo_ros/active_bo_ros/BayesianOptimization/BayesianOptimization.py
@@ -51,7 +51,7 @@ class BayesianOptimization:
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))
 
             env_reward += output[1]
             done = output[2]
@@ -71,8 +71,8 @@ class BayesianOptimization:
         self.env.reset(seed=seed)
         self.reset_bo()
 
-        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float32)
-        self.Y = np.zeros((self.nr_init, 1), dtype=np.float32)
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float64)
+        self.Y = np.zeros((self.nr_init, 1), dtype=np.float64)
 
         for i in range(self.nr_init):
             self.policy_model.random_policy()
@@ -122,34 +122,34 @@ class BayesianOptimization:
         return x_next
 
     def eval_new_observation(self, x_next, seed=None):
-        self.policy_model.weights = x_next
+        self.policy_model.weights = np.around(x_next, decimals=8)
         policy = self.policy_model.rollout()
 
         reward, step_count = self.runner(policy, seed=seed)
 
-        self.X = np.vstack((self.X, x_next), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_next, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)
 
         self.GP.fit(self.X, self.Y)
 
         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)
 
         self.episode += 1
         return step_count
 
     def add_new_observation(self, reward, x_new):
-        self.X = np.vstack((self.X, x_new), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_new, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)
 
         self.GP.fit(self.X, self.Y)
 
         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)
 
         self.episode += 1
diff --git a/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py b/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py
index b517fb9..b0c2a8b 100644
--- a/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py
+++ b/src/active_bo_ros/active_bo_ros/PolicyModel/GaussianRBFModel.py
@@ -24,7 +24,7 @@ class GaussianRBF:
         self.policy = np.zeros((self.nr_steps, 1))
 
     def random_policy(self):
-        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)
+        self.weights = np.around(self.rng.uniform(self.low, self.upper, self.nr_weights), decimals=8)
 
     def rollout(self):
         self.policy = np.zeros((self.nr_steps, 1))
diff --git a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
index 5b3b743..b2c9660 100644
--- a/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
+++ b/src/active_bo_ros/active_bo_ros/ReinforcementLearning/CartPole.py
@@ -101,15 +101,15 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         high = np.array(
             [
                 self.x_threshold * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
                 self.theta_threshold_radians * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
             ],
-            dtype=np.float32,
+            dtype=np.float64,
         )
 
-        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
-        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
+        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
+        self.observation_space = spaces.Box(-high, high, dtype=np.float64)
 
         self.render_mode = render_mode
@@ -185,7 +185,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+        return np.array(self.state, dtype=np.float64), reward, terminated, False, {}
 
     def reset(
         self,
@@ -204,7 +204,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
 
         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), {}
+        return np.array(self.state, dtype=np.float64), {}
 
     def render(self):
         if self.render_mode is None:
diff --git a/src/active_bo_ros/active_bo_ros/active_bo_topic.py b/src/active_bo_ros/active_bo_ros/active_bo_topic.py
index 6bc3988..a08c46b 100644
--- a/src/active_bo_ros/active_bo_ros/active_bo_topic.py
+++ b/src/active_bo_ros/active_bo_ros/active_bo_topic.py
@@ -329,13 +329,13 @@ class ActiveBOTopic(Node):
         if self.BO is not None and self.BO.Y is not None:
             self.best_reward = np.max(self.BO.Y)
 
-            state_msg = ActiveBOState()
-            state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
-            state_msg.current_episode = self.current_episode + 1 \
-                if self.current_episode < self.bo_episodes else self.bo_episodes
-            state_msg.best_reward = float(self.best_reward)
-            state_msg.last_user_reward = float(self.rl_reward)
-            self.state_pub.publish(state_msg)
+        state_msg = ActiveBOState()
+        state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
+        state_msg.current_episode = self.current_episode + 1 \
+            if self.current_episode < self.bo_episodes else self.bo_episodes
+        state_msg.best_reward = float(self.best_reward)
+        state_msg.last_user_reward = float(self.rl_reward)
+        self.state_pub.publish(state_msg)
 
 
 def main(args=None):
diff --git a/src/active_bo_ros/active_bo_ros/active_rl_topic.py b/src/active_bo_ros/active_bo_ros/active_rl_topic.py
index 9a5d56a..32b39bb 100644
--- a/src/active_bo_ros/active_bo_ros/active_rl_topic.py
+++ b/src/active_bo_ros/active_bo_ros/active_rl_topic.py
@@ -86,7 +86,7 @@ class ActiveRLService(Node):
     def active_rl_callback(self, msg):
         self.rl_env = msg.env
         self.rl_seed = msg.seed
-        self.rl_policy = np.array(msg.policy, dtype=np.float32)
+        self.rl_policy = np.array(msg.policy, dtype=np.float64)
         self.rl_weights = msg.weights
         self.final_run = msg.final_run
 
@@ -112,7 +112,7 @@ class ActiveRLService(Node):
         self.eval_weights = None
 
     def active_rl_eval_callback(self, msg):
-        self.eval_policy = np.array(msg.policy, dtype=np.float32)
+        self.eval_policy = np.array(msg.policy, dtype=np.float64)
         self.eval_weights = msg.weights
 
         self.get_logger().info('Active RL Eval: Responded!')
@@ -122,7 +122,7 @@ class ActiveRLService(Node):
     def next_image(self, policy):
         action = policy[self.rl_step]
         action_clipped = action.clip(min=-1.0, max=1.0)
-        output = self.env.step(action_clipped.astype(np.float32))
+        output = self.env.step(action_clipped.astype(np.float64))
 
         self.rl_reward += output[1]
         done = output[2]
diff --git a/src/active_bo_ros/active_bo_ros/rl_service.py b/src/active_bo_ros/active_bo_ros/rl_service.py
index 2c70046..18aa9c2 100644
--- a/src/active_bo_ros/active_bo_ros/rl_service.py
+++ b/src/active_bo_ros/active_bo_ros/rl_service.py
@@ -27,7 +27,7 @@ class RLService(Node):
         reward = 0
         step_count = 0
 
-        policy = np.array(request.policy, dtype=np.float32)
+        policy = np.array(request.policy, dtype=np.float64)
         rl_env = request.env
 
         if rl_env == "Mountain Car":
@@ -46,7 +46,7 @@ class RLService(Node):
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))
 
             reward += output[1]
             done = output[2]