pre major rl eval change
commit 1b9e099696
parent 355165e804
@@ -1,4 +1,4 @@
-float32[] best_policy
-float32[] best_weights
-float32[] reward_mean
-float32[] reward_std
+float64[] best_policy
+float64[] best_weights
+float64[] reward_mean
+float64[] reward_std

@@ -1,4 +1,4 @@
 uint16 current_run
 uint16 current_episode
-float32 best_reward
-float32 last_user_reward
+float64 best_reward
+float64 last_user_reward

@@ -1,5 +1,5 @@
 string env
 uint32 seed
 bool final_run
-float32[] policy
-float32[] weights
+float64[] policy
+float64[] weights

@@ -1,3 +1,3 @@
-float32[] weights
+float64[] weights
 uint16 final_step
-float32 reward
+float64 reward

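The message-definition hunks above widen every floating-point field from float32 to float64. A minimal, ROS-independent sketch of the precision gap between the two types (plain NumPy, illustrative values only):

    import numpy as np

    # float32 carries roughly 7 significant decimal digits, float64 roughly 15-16,
    # so rewards and policy weights survive the message round-trip with less rounding.
    print(f"{float(np.float32(0.1)):.17f}")  # 0.10000000149011612
    print(f"{float(np.float64(0.1)):.17f}")  # 0.10000000000000001
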
@@ -51,7 +51,7 @@ class BayesianOptimization:
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))

             env_reward += output[1]
             done = output[2]

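The clip-cast-step pattern changed above can be sketched as a small standalone rollout helper. The 5-tuple step return (obs, reward, terminated, truncated, info) and the float64 action dtype are assumptions matching the modified environment, and run_policy is an illustrative name, not the project's API:

    import numpy as np

    def run_policy(env, policy, seed=None):
        """Open-loop rollout: one precomputed action per step, clipped to [-1, 1]."""
        env.reset(seed=seed)
        total_reward, steps = 0.0, 0
        for action in policy:
            clipped = np.clip(action, -1.0, 1.0).astype(np.float64)
            obs, reward, terminated, truncated, info = env.step(clipped)
            total_reward += reward
            steps += 1
            if terminated or truncated:
                break
        return total_reward, steps
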
@@ -71,8 +71,8 @@ class BayesianOptimization:
         self.env.reset(seed=seed)
         self.reset_bo()

-        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float32)
-        self.Y = np.zeros((self.nr_init, 1), dtype=np.float32)
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float64)
+        self.Y = np.zeros((self.nr_init, 1), dtype=np.float64)

         for i in range(self.nr_init):
             self.policy_model.random_policy()

@@ -122,34 +122,34 @@ class BayesianOptimization:
         return x_next

     def eval_new_observation(self, x_next, seed=None):
-        self.policy_model.weights = x_next
+        self.policy_model.weights = np.around(x_next, decimals=8)
         policy = self.policy_model.rollout()

         reward, step_count = self.runner(policy, seed=seed)

-        self.X = np.vstack((self.X, x_next), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_next, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)

         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1
         return step_count

     def add_new_observation(self, reward, x_new):
-        self.X = np.vstack((self.X, x_new), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_new, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)

         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1

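The vstack calls above use the dtype keyword that np.vstack gained in NumPy 1.24. A self-contained sketch of growing the GP training set in float64 with inputs rounded to 8 decimals (array shapes and values are illustrative, not taken from the project):

    import numpy as np

    X = np.zeros((0, 5), dtype=np.float64)  # 5 policy weights, no observations yet
    Y = np.zeros((0, 1), dtype=np.float64)

    x_next = np.around(np.random.default_rng(0).uniform(-1.0, 1.0, 5), decimals=8)
    reward = 42.0  # placeholder episode return

    X = np.vstack((X, x_next), dtype=np.float64)  # dtype kwarg requires NumPy >= 1.24
    Y = np.vstack((Y, reward), dtype=np.float64)
    print(X.shape, Y.shape, X.dtype)  # (1, 5) (1, 1) float64
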
@@ -24,7 +24,7 @@ class GaussianRBF:
         self.policy = np.zeros((self.nr_steps, 1))

     def random_policy(self):
-        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)
+        self.weights = np.around(self.rng.uniform(self.low, self.upper, self.nr_weights), decimals=8)

     def rollout(self):
         self.policy = np.zeros((self.nr_steps, 1))

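A standalone sketch of the rounded random-weight draw above, with a Generator-based RNG and illustrative bounds standing in for the class's low, upper, and nr_weights attributes:

    import numpy as np

    rng = np.random.default_rng(seed=0)
    low, upper, nr_weights = -1.0, 1.0, 6  # illustrative values
    weights = np.around(rng.uniform(low, upper, nr_weights), decimals=8)
    print(weights.dtype, weights)  # float64, every entry rounded to 8 decimal places
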
@@ -101,15 +101,15 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         high = np.array(
             [
                 self.x_threshold * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
                 self.theta_threshold_radians * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
             ],
-            dtype=np.float32,
+            dtype=np.float64,
         )

-        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
-        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
+        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
+        self.observation_space = spaces.Box(-high, high, dtype=np.float64)

         self.render_mode = render_mode

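Declaring float64 spaces as in the hunk above can be sketched with gymnasium.spaces.Box (whether the project imports gym or gymnasium is not visible here; the thresholds below are the classic CartPole limits, used only for illustration):

    import numpy as np
    from gymnasium import spaces

    x_threshold = 2.4
    theta_threshold_radians = 12 * 2 * np.pi / 360
    high = np.array(
        [x_threshold * 2, np.finfo(np.float64).max,
         theta_threshold_radians * 2, np.finfo(np.float64).max],
        dtype=np.float64,
    )
    action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
    observation_space = spaces.Box(-high, high, dtype=np.float64)
    print(action_space.dtype, observation_space.dtype)  # float64 float64
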
@@ -185,7 +185,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         if self.render_mode == "human":
             self.render()

-        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+        return np.array(self.state, dtype=np.float64), reward, terminated, False, {}

     def reset(
         self,

@@ -204,7 +204,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):

         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), {}
+        return np.array(self.state, dtype=np.float64), {}

     def render(self):
         if self.render_mode is None:

@@ -329,13 +329,13 @@ class ActiveBOTopic(Node):
         if self.BO is not None and self.BO.Y is not None:
             self.best_reward = np.max(self.BO.Y)

-        state_msg = ActiveBOState()
-        state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
-        state_msg.current_episode = self.current_episode + 1 \
-            if self.current_episode < self.bo_episodes else self.bo_episodes
-        state_msg.best_reward = float(self.best_reward)
-        state_msg.last_user_reward = float(self.rl_reward)
-        self.state_pub.publish(state_msg)
+        state_msg = ActiveBOState()
+        state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
+        state_msg.current_episode = self.current_episode + 1 \
+            if self.current_episode < self.bo_episodes else self.bo_episodes
+        state_msg.best_reward = float(self.best_reward)
+        state_msg.last_user_reward = float(self.rl_reward)
+        self.state_pub.publish(state_msg)


 def main(args=None):

@@ -86,7 +86,7 @@ class ActiveRLService(Node):
     def active_rl_callback(self, msg):
        self.rl_env = msg.env
         self.rl_seed = msg.seed
-        self.rl_policy = np.array(msg.policy, dtype=np.float32)
+        self.rl_policy = np.array(msg.policy, dtype=np.float64)
         self.rl_weights = msg.weights
         self.final_run = msg.final_run

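In rclpy, a float64[] message field typically arrives as a Python array('d'), so the np.array(..., dtype=np.float64) conversion above is loss-free. A small stand-in sketch without ROS (values are illustrative):

    import numpy as np
    from array import array

    msg_policy = array('d', [0.25, -1.3, 0.8])  # stand-in for msg.policy (float64[])
    policy = np.array(msg_policy, dtype=np.float64)  # no narrowing, unlike float32
    print(policy.dtype)  # float64
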
@@ -112,7 +112,7 @@ class ActiveRLService(Node):
         self.eval_weights = None

     def active_rl_eval_callback(self, msg):
-        self.eval_policy = np.array(msg.policy, dtype=np.float32)
+        self.eval_policy = np.array(msg.policy, dtype=np.float64)
         self.eval_weights = msg.weights

         self.get_logger().info('Active RL Eval: Responded!')

@@ -122,7 +122,7 @@ class ActiveRLService(Node):
     def next_image(self, policy):
         action = policy[self.rl_step]
         action_clipped = action.clip(min=-1.0, max=1.0)
-        output = self.env.step(action_clipped.astype(np.float32))
+        output = self.env.step(action_clipped.astype(np.float64))

         self.rl_reward += output[1]
         done = output[2]

@@ -27,7 +27,7 @@ class RLService(Node):

         reward = 0
         step_count = 0
-        policy = np.array(request.policy, dtype=np.float32)
+        policy = np.array(request.policy, dtype=np.float64)
         rl_env = request.env

         if rl_env == "Mountain Car":

@@ -46,7 +46,7 @@ class RLService(Node):
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))

             reward += output[1]
             done = output[2]