pre major rl eval change

Niko Feith 2023-06-07 11:59:17 +02:00
parent 355165e804
commit 1b9e099696
11 changed files with 40 additions and 40 deletions

View File

@@ -1,4 +1,4 @@
-float32[] best_policy
-float32[] best_weights
-float32[] reward_mean
-float32[] reward_std
+float64[] best_policy
+float64[] best_weights
+float64[] reward_mean
+float64[] reward_std

View File

@@ -1,4 +1,4 @@
 uint16 current_run
 uint16 current_episode
-float32 best_reward
-float32 last_user_reward
+float64 best_reward
+float64 last_user_reward

View File

@@ -1,5 +1,5 @@
 string env
 uint32 seed
 bool final_run
-float32[] policy
-float32[] weights
+float64[] policy
+float64[] weights

View File

@@ -1,3 +1,3 @@
-float32[] weights
+float64[] weights
 uint16 final_step
-float32 reward
+float64 reward

View File

@@ -51,7 +51,7 @@ class BayesianOptimization:
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))
             env_reward += output[1]
             done = output[2]
@@ -71,8 +71,8 @@ class BayesianOptimization:
         self.env.reset(seed=seed)
         self.reset_bo()

-        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float32)
-        self.Y = np.zeros((self.nr_init, 1), dtype=np.float32)
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float64)
+        self.Y = np.zeros((self.nr_init, 1), dtype=np.float64)

         for i in range(self.nr_init):
             self.policy_model.random_policy()
@@ -122,34 +122,34 @@ class BayesianOptimization:
         return x_next

     def eval_new_observation(self, x_next, seed=None):
-        self.policy_model.weights = x_next
+        self.policy_model.weights = np.around(x_next, decimals=8)
         policy = self.policy_model.rollout()
         reward, step_count = self.runner(policy, seed=seed)

-        self.X = np.vstack((self.X, x_next), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_next, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)
         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1
         return step_count

     def add_new_observation(self, reward, x_new):
-        self.X = np.vstack((self.X, x_new), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_new, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)
         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1
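
A minimal sketch of the observation bookkeeping in the hunk above; record_observation is a hypothetical helper that condenses what eval_new_observation and add_new_observation both do after this change, assuming the surrounding GP and attribute layout (note that np.vstack only accepts a dtype argument from NumPy 1.24 onward):

import numpy as np

def record_observation(bo, x_new, reward):
    # Round the proposed weights to 8 decimals and store everything as float64
    # before refitting the GP, as both methods now do.
    x_new = np.around(np.asarray(x_new, dtype=np.float64), decimals=8)
    bo.X = np.vstack((bo.X, x_new), dtype=np.float64)   # dtype kwarg: NumPy >= 1.24
    bo.Y = np.vstack((bo.Y, reward), dtype=np.float64)
    bo.GP.fit(bo.X, bo.Y)

    # Track the best reward seen so far, one entry per episode.
    if bo.episode == 0:
        bo.best_reward[0] = np.max(bo.Y)
    else:
        bo.best_reward = np.vstack((bo.best_reward, np.max(bo.Y)), dtype=np.float64)
    bo.episode += 1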

View File

@@ -24,7 +24,7 @@ class GaussianRBF:
         self.policy = np.zeros((self.nr_steps, 1))

     def random_policy(self):
-        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)
+        self.weights = np.around(self.rng.uniform(self.low, self.upper, self.nr_weights), decimals=8)

     def rollout(self):
         self.policy = np.zeros((self.nr_steps, 1))
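
The randomly drawn weights are now rounded to 8 decimals as well, presumably so that the values the optimizer stores match what survives serialization; a small sketch under that assumption, with illustrative bounds and weight count:

import numpy as np

rng = np.random.default_rng()
low, upper, nr_weights = -1.0, 1.0, 6   # assumed bounds and weight count
weights = np.around(rng.uniform(low, upper, nr_weights), decimals=8)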

View File

@@ -101,15 +101,15 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         high = np.array(
             [
                 self.x_threshold * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
                 self.theta_threshold_radians * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
             ],
-            dtype=np.float32,
+            dtype=np.float64,
         )

-        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
-        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
+        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
+        self.observation_space = spaces.Box(-high, high, dtype=np.float64)

         self.render_mode = render_mode
@@ -185,7 +185,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+        return np.array(self.state, dtype=np.float64), reward, terminated, False, {}

     def reset(
         self,
@@ -204,7 +204,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), {}
+        return np.array(self.state, dtype=np.float64), {}

     def render(self):
         if self.render_mode is None:
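
A condensed sketch of the double-precision spaces the modified CartPole now advertises; the threshold values and the import path are assumptions taken from the classic CartPole, not from this diff:

import numpy as np
from gymnasium.spaces import Box   # assumed import path for Box/spaces.Box

x_threshold = 2.4                               # assumed classic CartPole value
theta_threshold_radians = 12 * 2 * np.pi / 360  # assumed classic CartPole value

high = np.array(
    [x_threshold * 2, np.finfo(np.float64).max,
     theta_threshold_radians * 2, np.finfo(np.float64).max],
    dtype=np.float64,
)
action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
observation_space = Box(-high, high, dtype=np.float64)
# step() and reset() return np.array(self.state, dtype=np.float64) to match.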

View File

@@ -329,13 +329,13 @@ class ActiveBOTopic(Node):
         if self.BO is not None and self.BO.Y is not None:
             self.best_reward = np.max(self.BO.Y)

         state_msg = ActiveBOState()
         state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
         state_msg.current_episode = self.current_episode + 1 \
             if self.current_episode < self.bo_episodes else self.bo_episodes
         state_msg.best_reward = float(self.best_reward)
         state_msg.last_user_reward = float(self.rl_reward)
         self.state_pub.publish(state_msg)

 def main(args=None):

View File

@@ -86,7 +86,7 @@ class ActiveRLService(Node):
     def active_rl_callback(self, msg):
         self.rl_env = msg.env
         self.rl_seed = msg.seed
-        self.rl_policy = np.array(msg.policy, dtype=np.float32)
+        self.rl_policy = np.array(msg.policy, dtype=np.float64)
         self.rl_weights = msg.weights
         self.final_run = msg.final_run
@@ -112,7 +112,7 @@ class ActiveRLService(Node):
         self.eval_weights = None

     def active_rl_eval_callback(self, msg):
-        self.eval_policy = np.array(msg.policy, dtype=np.float32)
+        self.eval_policy = np.array(msg.policy, dtype=np.float64)
         self.eval_weights = msg.weights

         self.get_logger().info('Active RL Eval: Responded!')
@@ -122,7 +122,7 @@ class ActiveRLService(Node):
     def next_image(self, policy):
         action = policy[self.rl_step]
         action_clipped = action.clip(min=-1.0, max=1.0)
-        output = self.env.step(action_clipped.astype(np.float32))
+        output = self.env.step(action_clipped.astype(np.float64))
         self.rl_reward += output[1]
         done = output[2]

View File

@@ -27,7 +27,7 @@ class RLService(Node):
         reward = 0
         step_count = 0
-        policy = np.array(request.policy, dtype=np.float32)
+        policy = np.array(request.policy, dtype=np.float64)
         rl_env = request.env

         if rl_env == "Mountain Car":
@@ -46,7 +46,7 @@ class RLService(Node):
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))
             reward += output[1]
             done = output[2]
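
The same action-casting change appears in all three rollout loops touched by this commit (BayesianOptimization.runner, ActiveRLService.next_image, RLService). A minimal sketch of that shared pattern, assuming an environment with the five-tuple step return used by the modified CartPole; the rollout helper itself is illustrative, not a name from the repository:

import numpy as np

def rollout(env, policy, seed=None):
    # Clip each action to [-1, 1], cast to float64, step the environment,
    # and stop once the episode terminates.
    env.reset(seed=seed)
    total_reward, step_count = 0.0, 0
    for action in np.asarray(policy, dtype=np.float64):
        action = np.clip(np.atleast_1d(action), -1.0, 1.0).astype(np.float64)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        step_count += 1
        if terminated or truncated:
            break
    return total_reward, step_count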