diff --git a/src/active_bo_msgs/msg/ActiveBORequest.msg b/src/active_bo_msgs/msg/ActiveBORequest.msg
index b8f5e1a..5f7e22b 100644
--- a/src/active_bo_msgs/msg/ActiveBORequest.msg
+++ b/src/active_bo_msgs/msg/ActiveBORequest.msg
@@ -4,6 +4,7 @@ string metric
 uint16 nr_weights
 uint16 max_steps
 uint16 nr_episodes
+uint16 nr_dims
 uint16 nr_runs
 string acquisition_function
 float32 metric_parameter
diff --git a/src/active_bo_msgs/msg/ActiveRLRequest.msg b/src/active_bo_msgs/msg/ActiveRLRequest.msg
index 83b4301..e59d3b8 100644
--- a/src/active_bo_msgs/msg/ActiveRLRequest.msg
+++ b/src/active_bo_msgs/msg/ActiveRLRequest.msg
@@ -3,4 +3,5 @@ uint32 seed
 bool display_run
 uint8 interactive_run
 float64[] policy
-float64[] weights
\ No newline at end of file
+float64[] weights
+uint16 nr_dims
\ No newline at end of file
diff --git a/src/active_bo_ros/active_bo_ros/interactive_bo_2d.py b/src/active_bo_ros/active_bo_ros/interactive_bo_2d.py
index eab7b11..b53146a 100644
--- a/src/active_bo_ros/active_bo_ros/interactive_bo_2d.py
+++ b/src/active_bo_ros/active_bo_ros/interactive_bo_2d.py
@@ -36,7 +36,7 @@ class ActiveBOTopic(Node):
         self.active_bo_sub = self.create_subscription(ActiveBORequest,
                                                       'active_bo_request',
-                                                      self.active_bo_callback,
+                                                      self.bo_callback,
                                                       1,
                                                       callback_group=bo_callback_group)
         self.active_bo_pending = False
@@ -63,7 +63,7 @@ class ActiveBOTopic(Node):
                                                       1,
                                                       callback_group=rl_callback_group)
         self.active_rl_sub = self.create_subscription(ActiveRLResponse,
                                                       'active_rl_response',
-                                                      self.active_rl_callback,
+                                                      self.rl_callback,
                                                       1,
                                                       callback_group=rl_callback_group)
         self.rl_pending = False
@@ -105,6 +105,7 @@ class ActiveBOTopic(Node):
         self.bo_metric = None
         self.bo_fixed_seed = False
         self.bo_nr_weights = None
+        self.bo_nr_dims = None
         self.bo_steps = 0
         self.bo_episodes = 0
         self.bo_runs = 0
@@ -120,7 +121,7 @@ class ActiveBOTopic(Node):
         self.BO = None
         self.overwrite = False

-    def active_bo_callback(self, msg):
+    def bo_callback(self, msg):
         if not self.active_bo_pending:
             # self.get_logger().info('Active Bayesian Optimization request pending!')
             self.active_bo_pending = True
@@ -128,6 +129,7 @@ class ActiveBOTopic(Node):
             self.bo_metric = msg.metric
             self.bo_fixed_seed = msg.fixed_seed
             self.bo_nr_weights = msg.nr_weights
+            self.bo_nr_dims = msg.nr_dims
             self.bo_steps = msg.max_steps
             self.bo_episodes = msg.nr_episodes
             self.bo_runs = msg.nr_runs
@@ -135,19 +137,18 @@ class ActiveBOTopic(Node):
             self.bo_metric_parameter = msg.metric_parameter
             self.bo_metric_parameter_2 = msg.metric_parameter_2
             self.save_result = msg.save_result
-            self.seed_array = np.zeros((1, self.bo_runs))
+            self.seed_array = np.zeros((self.bo_runs, 1))
             self.overwrite = msg.overwrite

             # initialize
-            self.reward = np.zeros((self.bo_episodes + self.nr_init - 1, self.bo_runs))
-            self.best_pol_reward = np.zeros((1, self.bo_runs))
-            self.best_policy = np.zeros((self.bo_steps, self.bo_runs))
-            self.best_weights = np.zeros((self.bo_nr_weights, self.bo_runs))
+            self.reward = np.zeros((self.bo_runs, self.bo_episodes + self.nr_init - 1))
+            self.best_pol_reward = np.zeros((self.bo_runs, 1))
+            self.best_policy = np.zeros((self.bo_runs, self.bo_steps, self.bo_nr_dims))
+            self.best_weights = np.zeros((self.bo_runs, self.bo_nr_weights, self.bo_nr_dims))

             # set the seed
             if self.bo_fixed_seed:
                 self.seed = int(np.random.randint(1, 2147483647, 1)[0])
-                # self.get_logger().info(str(self.seed))
             else:
                 self.seed = None
@@ -161,7 +162,7 @@ class ActiveBOTopic(Node):
         self.rl_weights = None
         self.rl_final_step = None

-    def active_rl_callback(self, msg):
+    def rl_callback(self, msg):
         if self.rl_pending:
             # self.get_logger().info('Active Reinforcement Learning response received!')
             self.rl_weights = np.array(msg.weights, dtype=np.float64)
@@ -201,7 +202,7 @@ class ActiveBOTopic(Node):

         if self.BO is None:
             self.BO = BayesianOptimization(self.bo_steps,
-                                           2,
+                                           self.bo_nr_dims,
                                            self.bo_nr_weights,
                                            acq=self.bo_acq_fcn)

@@ -223,8 +224,9 @@ class ActiveBOTopic(Node):
             rl_msg.seed = seed
             rl_msg.display_run = False
             rl_msg.interactive_run = 2
-            rl_msg.weights = self.BO.policy_model.random_policy().tolist()
-            rl_msg.policy = self.BO.policy_model.rollout().reshape(-1,).tolist()
+            rl_msg.weights = self.BO.policy_model.random_policy().flatten().tolist()
+            rl_msg.policy = self.BO.policy_model.rollout().flatten().tolist()
+            rl_msg.nr_dims = self.bo_nr_dims

             self.active_rl_pub.publish(rl_msg)

@@ -234,25 +236,15 @@ class ActiveBOTopic(Node):
             bo_response = ActiveBOResponse()

             best_policy_idx = np.argmax(self.best_pol_reward)
-            bo_response.best_policy = self.best_policy[:, best_policy_idx].tolist()
-            bo_response.best_weights = self.best_weights[:, best_policy_idx].tolist()
-
-            # self.get_logger().info(f'Best Policy: {self.best_pol_reward}')
-
-            self.get_logger().info(f'{best_policy_idx}, {int(self.seed_array[0, best_policy_idx])}')
+            bo_response.best_policy = self.best_policy[best_policy_idx, :, :].flatten().tolist()
+            bo_response.best_weights = self.best_weights[best_policy_idx, :, :].flatten().tolist()

             bo_response.reward_mean = np.mean(self.reward, axis=1).tolist()
             bo_response.reward_std = np.std(self.reward, axis=1).tolist()

             if self.save_result:
-                if self.bo_env == "Mountain Car":
-                    env = 'mc'
-                elif self.bo_env == "Cartpole":
-                    env = 'cp'
-                elif self.bo_env == "Acrobot":
-                    env = 'ab'
-                elif self.bo_env == "Pendulum":
-                    env = 'pd'
+                if self.bo_env == "Reacher":
+                    env = 're'
                 else:
                     raise NotImplementedError

@@ -283,7 +275,6 @@ class ActiveBOTopic(Node):

             if self.bo_fixed_seed:
                 seed = int(self.seed_array[0, best_policy_idx])
-                # self.get_logger().info(f'Used seed{seed}')
             else:
                 seed = int(np.random.randint(1, 2147483647, 1)[0])

@@ -291,8 +282,9 @@ class ActiveBOTopic(Node):
             active_rl_request.seed = seed
             active_rl_request.display_run = True
             active_rl_request.interactive_run = 1
-            active_rl_request.policy = self.best_policy[:, best_policy_idx].tolist()
-            active_rl_request.weights = self.best_weights[:, best_policy_idx].tolist()
+            active_rl_request.policy = self.best_policy[best_policy_idx, :, :].flatten().tolist()
+            active_rl_request.weights = self.best_weights[best_policy_idx, :, :].flatten().tolist()
+            active_rl_request.nr_dims = self.bo_nr_dims

             self.active_rl_pub.publish(active_rl_request)

@@ -316,7 +308,7 @@ class ActiveBOTopic(Node):
                     user_query = ImprovementQuery(self.bo_metric_parameter,
                                                   self.bo_metric_parameter_2,
                                                   self.last_query,
-                                                  self.reward[:self.current_episode, self.current_run])
+                                                  self.reward[self.current_run, :self.current_episode])

                 elif self.bo_metric == "max acquisition":
                     user_query = MaxAcqQuery(self.bo_metric_parameter,
@@ -347,30 +339,17 @@ class ActiveBOTopic(Node):
                     active_rl_request.seed = seed
                     active_rl_request.display_run = True
                     active_rl_request.interactive_run = 0
-                    active_rl_request.policy = old_policy.tolist()
-                    active_rl_request.weights = old_weights.tolist()
+                    active_rl_request.policy = old_policy.flatten().tolist()
+                    active_rl_request.weights = old_weights.flatten().tolist()
+                    active_rl_request.nr_dims = self.bo_nr_dims

                     # self.get_logger().info('Calling: Active RL')
                     self.active_rl_pub.publish(active_rl_request)
                     self.rl_pending = True

                 else:
-                    # if self.bo_acq_fcn == "Preference Expected Improvement":
-                    #     self.get_logger().info(f"{self.BO.acq_fun.proposal_mean}")
-                    #     self.get_logger().info(f"X: {self.BO.X}")
                     x_next = self.BO.next_observation()
-                    # self.get_logger().info(f'x_next: {x_next}')
-                    # self.get_logger().info(f'overwrite: {self.weight_preference}')
-                    # self.get_logger().info(f'rl_weights: {self.rl_weights}')
-
-                    if self.overwrite:
-                        if self.weight_preference is not None and self.rl_weights is not None:
-                            x_next[self.weight_preference] = self.rl_weights[self.weight_preference]
-                            # self.get_logger().info(f'x_next: {x_next}')
-                            # self.get_logger().info(f'overwrite: {self.weight_preference}')
-                            # self.get_logger().info(f'rl_weights: {self.rl_weights}')
-                    # self.get_logger().info('Next Observation BO!')
-                    self.BO.policy_model.weights = np.around(x_next, decimals=8)
+                    self.BO.policy_model.set_weights(np.around(x_next, decimals=8))
                     if self.bo_fixed_seed:
                         seed = self.seed
                     else:
@@ -381,23 +360,23 @@ class ActiveBOTopic(Node):
                     rl_msg.seed = seed
                     rl_msg.display_run = False
                     rl_msg.interactive_run = 2
-                    rl_msg.policy = self.BO.policy_model.rollout().reshape(-1,).tolist()
-                    rl_msg.weights = x_next.tolist()
+                    rl_msg.policy = self.BO.policy_model.rollout().flatten().tolist()
+                    rl_msg.weights = x_next.flatten().tolist()
+                    rl_msg.nr_dims = self.bo_nr_dims

                     self.rl_pending = True

                     self.active_rl_pub.publish(rl_msg)

-                self.reward[self.current_episode, self.current_run] = np.max(self.BO.Y)
+                self.reward[self.current_run, self.current_episode] = np.max(self.BO.Y)
                 self.get_logger().info(f'Current Episode: {self.current_episode},'
-                                       f' best reward: {self.reward[self.current_episode, self.current_run]}')
+                                       f' best reward: {self.reward[self.current_run, self.current_episode]}')
                 self.current_episode += 1
-
             else:
-                self.best_policy[:, self.current_run], \
-                    self.best_pol_reward[:, self.current_run], \
-                    self.best_weights[:, self.current_run], idx = self.BO.get_best_result()
+                self.best_policy[self.current_run, :, :], \
+                    self.best_pol_reward[self.current_run, :], \
+                    self.best_weights[self.current_run, :, :], idx = self.BO.get_best_result()

                 if self.current_run < self.bo_runs - 1:
                     self.BO = None
@@ -405,7 +384,8 @@ class ActiveBOTopic(Node):
                     self.current_episode = 0
                     self.last_query = 0
                     if self.bo_fixed_seed:
-                        self.seed_array[0, self.current_run] = self.seed
+                        self.seed_array[self.current_run, 0] = self.seed
+                    else:
                         self.seed = int(np.random.randint(1, 2147483647, 1)[0])
                         # self.get_logger().info(f'{self.seed}')
                 self.current_run += 1
@@ -443,4 +423,3 @@ def main(args=None):

 if __name__ == '__main__':
     main()
-
diff --git a/src/active_bo_ros/active_bo_ros/interactive_rl_2d.py b/src/active_bo_ros/active_bo_ros/interactive_rl_2d.py
index 0729c09..dc3735e 100644
--- a/src/active_bo_ros/active_bo_ros/interactive_rl_2d.py
+++ b/src/active_bo_ros/active_bo_ros/interactive_rl_2d.py
@@ -10,11 +10,7 @@ from rclpy.node import Node
 from rclpy.callback_groups import ReentrantCallbackGroup

-from active_bo_ros.ReinforcementLearning.ContinuousMountainCar import Continuous_MountainCarEnv
-from active_bo_ros.ReinforcementLearning.CartPole import CartPoleEnv
-from active_bo_ros.ReinforcementLearning.Pendulum import PendulumEnv
-from active_bo_ros.ReinforcementLearning.Acrobot import AcrobotEnv
-
+from dm_control import suite
 import numpy as np
 import time

@@ -34,7 +30,7 @@ class ActiveRL(Node):
                                                       1,
                                                       callback_group=rl_callback_group)
         self.active_rl_sub = self.create_subscription(ActiveRLRequest,
                                                       'active_rl_request',
-                                                      self.active_rl_callback,
+                                                      self.rl_callback,
                                                       1,
                                                       callback_group=rl_callback_group)

         self.rl_env = None
@@ -56,17 +52,20 @@ class ActiveRL(Node):
                                                  callback_group=topic_callback_group)
         self.eval_sub = self.create_subscription(ActiveRLEvalResponse,
                                                  'active_rl_eval_response',
-                                                 self.active_rl_eval_callback,
+                                                 self.rl_eval_callback,
                                                  1,
                                                  callback_group=topic_callback_group)

         self.eval_response_received = False
         self.eval_policy = None
         self.eval_weights = None
-        self.overwrite_weight = None
+        self.weight_preference = None

         # RL Environments
         self.env = None
+        self.rl_spec = None
+        self.rl_dims = None
+        self.pol_dims = None

         # State Machine Variables
         self.best_pol_shown = False
@@ -83,61 +82,65 @@ class ActiveRL(Node):

     def reset_rl_request(self):
         self.rl_env = None
+        self.rl_spec = None
         self.rl_seed = None
         self.rl_policy = None
         self.rl_weights = None
         self.interactive_run = 0
         self.display_run = False
+        self.rl_dims = None
+        self.pol_dims = None

-    def active_rl_callback(self, msg):
+    def rl_callback(self, msg):
         self.rl_env = msg.env
         self.rl_seed = msg.seed
         self.display_run = msg.display_run
-        self.rl_policy = np.array(msg.policy, dtype=np.float64)
+        self.rl_dims = msg.nr_dims
         self.rl_weights = msg.weights
+        pol = msg.policy
+        self.pol_dims = (len(pol) // self.rl_dims, self.rl_dims)
+        self.rl_policy = np.array(pol, dtype=np.float64).reshape(self.pol_dims)
         self.interactive_run = msg.interactive_run

-        if self.rl_env == "Mountain Car":
-            self.env = Continuous_MountainCarEnv(render_mode="rgb_array")
-        elif self.rl_env == "Cartpole":
-            self.env = CartPoleEnv(render_mode="rgb_array")
-        elif self.rl_env == "Acrobot":
-            self.env = AcrobotEnv(render_mode="rgb_array")
-        elif self.rl_env == "Pendulum":
-            self.env = PendulumEnv(render_mode="rgb_array")
+        if self.rl_env == "Reacher":
+            random_state = np.random.RandomState(seed=self.rl_seed)
+            self.env = suite.load('reacher', 'hard', task_kwargs={'random': random_state})
+            self.rl_spec = self.env.action_spec()
+            self.env.reset()
         else:
             raise NotImplementedError

-        # self.get_logger().info('Active RL: Called!')
-        self.env.reset(seed=self.rl_seed)
         self.rl_pending = True
         self.policy_sent = False
-        self.rl_step = 0

     def reset_eval_request(self):
         self.eval_policy = None
         self.eval_weights = None

-    def active_rl_eval_callback(self, msg):
-        self.eval_policy = np.array(msg.policy, dtype=np.float64)
+    def rl_eval_callback(self, msg):
+        self.eval_policy = np.array(msg.policy, dtype=np.float64).reshape(self.pol_dims)
         self.eval_weights = msg.weights
-        self.overwrite_weight = msg.overwrite_weight
+        self.weight_preference = msg.weight_preference

         self.get_logger().info('Active RL Eval: Responded!')

-        self.env.reset(seed=self.rl_seed)
+        self.env.reset()
         self.eval_response_received = True

-    def next_image(self, policy, display_run):
-        action = policy[self.rl_step]
+    def step(self, policy, display_run):
+        done = False
+
+        action = policy[self.rl_step, :]
         action_clipped = action.clip(min=-1.0, max=1.0)
         output = self.env.step(action_clipped.astype(np.float64))

-        self.rl_reward += output[1]
-        done = output[2]
-        self.rl_step += 1
+        if output.reward != 0.0:
+            self.rl_reward += output.reward * 10
+            done = True
+        else:
+            self.rl_step += 1

         if display_run:
-            rgb_array = self.env.render()
+            rgb_array = self.env.physics.render(camera_id=0, height=400, width=600)
             rgb_shape = rgb_array.shape

             red = rgb_array[:, :, 0].flatten().tolist()
@@ -160,25 +163,27 @@ class ActiveRL(Node):
         return done

-    def complete_run(self, policy):
+    def runner(self, policy):
         env_reward = 0.0
         step_count = 0
+        done = False

-        self.env.reset(seed=self.rl_seed)
+        self.env.reset()

-        for i in range(len(policy)):
-            action = policy[i]
+        for i in range(policy.shape[0]):
+            action = policy[i, :]
             action_clipped = action.clip(min=-1.0, max=1.0)
             output = self.env.step(action_clipped.astype(np.float64))
-
-            env_reward += output[1]
-            done = output[2]
             step_count += 1

+            if output.reward != 0.0:
+                env_reward += output.reward * 10
+                done = True
+            else:
+                self.rl_step -= 1.0
+
             if done:
                 break
-
-        self.env.reset(seed=self.rl_seed)

         return env_reward, step_count

     def mainloop_callback(self):
@@ -188,10 +193,10 @@ class ActiveRL(Node):
                 if not self.policy_sent:
                     self.rl_step = 0
                     self.rl_reward = 0.0
-                    self.env.reset(seed=self.rl_seed)
+                    self.env.reset()

                     eval_request = ActiveRLEvalRequest()
-                    eval_request.policy = self.rl_policy.tolist()
+                    eval_request.policy = self.rl_policy.flatten().tolist()
                     eval_request.weights = self.rl_weights

                     self.eval_pub.publish(eval_request)
@@ -200,7 +205,7 @@ class ActiveRL(Node):

                     self.policy_sent = True

-                done = self.next_image(self.rl_policy, self.display_run)
+                done = self.step(self.rl_policy, self.display_run)

                 if done:
                     self.best_pol_shown = True
@@ -212,18 +217,18 @@ class ActiveRL(Node):
                     pass

                 if self.eval_response_received:
-                    done = self.next_image(self.eval_policy, self.display_run)
+                    done = self.step(self.eval_policy, self.display_run)

                     if done:
                         rl_response = ActiveRLResponse()
                         rl_response.weights = self.eval_weights
                         rl_response.reward = self.rl_reward
                         rl_response.final_step = self.rl_step
-                        rl_response.overwrite_weight = self.overwrite_weight
+                        rl_response.weight_preference = self.weight_preference

                         self.active_rl_pub.publish(rl_response)

-                        self.env.reset(seed=self.rl_seed)
+                        self.env.reset()

                         # reset flags and attributes
                         self.reset_eval_request()
@@ -240,10 +245,10 @@ class ActiveRL(Node):
                 if not self.policy_sent:
                     self.rl_step = 0
                     self.rl_reward = 0.0
-                    self.env.reset(seed=self.rl_seed)
+                    self.env.reset()

                     eval_request = ActiveRLEvalRequest()
-                    eval_request.policy = self.rl_policy.tolist()
+                    eval_request.policy = self.rl_policy.flatten().tolist()
                     eval_request.weights = self.rl_weights

                     self.eval_pub.publish(eval_request)
@@ -252,7 +257,7 @@ class ActiveRL(Node):

                     self.policy_sent = True

-                done = self.next_image(self.rl_policy, self.display_run)
+                done = self.step(self.rl_policy, self.display_run)

                 if done:
                     self.rl_step = 0
@@ -260,18 +265,18 @@ class ActiveRL(Node):
                     self.rl_reward = 0.0
                     self.rl_pending = False

             elif self.interactive_run == 2:
-                env_reward, step_count = self.complete_run(self.rl_policy)
+                env_reward, step_count = self.runner(self.rl_policy)

                 rl_response = ActiveRLResponse()
                 rl_response.weights = self.rl_weights
                 rl_response.reward = env_reward
                 rl_response.final_step = step_count

-                if self.overwrite_weight is None:
-                    overwrite_weight = [False] * len(self.rl_weights)
+                if self.weight_preference is None:
+                    weight_preference = [False] * len(self.rl_weights)
                 else:
-                    overwrite_weight = self.overwrite_weight
+                    weight_preference = self.weight_preference

-                rl_response.overwrite_weight = overwrite_weight
+                rl_response.weight_preference = weight_preference
                 self.active_rl_pub.publish(rl_response)
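A note on the flattened policy/weights convention these changes introduce: the BO node now publishes the (steps x nr_dims) policy and (nr_weights x nr_dims) weights as flat float64 arrays together with the new nr_dims field, and the RL node restores the 2-D shape on receipt using integer division. A minimal NumPy sketch of that round trip (the array sizes and variable names are illustrative, not taken from the code):

```python
import numpy as np

# Sender side: a (steps, nr_dims) policy is flattened row-major before publishing.
nr_dims = 2
policy_2d = np.arange(10.0).reshape(5, nr_dims)      # hypothetical (steps, nr_dims) rollout

msg_policy = policy_2d.flatten().tolist()            # goes into ActiveRLRequest.policy
msg_nr_dims = nr_dims                                 # goes into ActiveRLRequest.nr_dims

# Receiver side (mirrors rl_callback): integer division keeps the shape tuple valid.
pol_dims = (len(msg_policy) // msg_nr_dims, msg_nr_dims)
restored = np.array(msg_policy, dtype=np.float64).reshape(pol_dims)

assert np.array_equal(restored, policy_2d)
```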
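For reference, a standalone sketch of the dm_control calls the reworked interactive_rl_2d.py now depends on: seeded task loading, stepping, sparse-reward checks, and offscreen rendering. The task name, camera id, and frame size mirror the values in the diff; the seed value and the random-action loop are purely illustrative, and a working MuJoCo installation is assumed.

```python
import numpy as np
from dm_control import suite

# Load the Reacher "hard" task with a seeded RandomState, as in rl_callback.
random_state = np.random.RandomState(seed=42)
env = suite.load('reacher', 'hard', task_kwargs={'random': random_state})

spec = env.action_spec()          # bounds and shape of valid actions
time_step = env.reset()

for _ in range(5):
    # Random action within the spec bounds (stand-in for the BO policy rollout).
    action = np.random.uniform(spec.minimum, spec.maximum, size=spec.shape)
    time_step = env.step(action)
    # env.step returns a TimeStep; reward is None right after reset and a float
    # afterwards (sparse for reacher-hard, hence the != 0.0 check in the node).
    if time_step.reward:
        break

# Offscreen rendering through the MuJoCo physics object, matching the diff's call.
frame = env.physics.render(camera_id=0, height=400, width=600)  # uint8 array (400, 600, 3)
```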