pre major rl eval change
commit 1b9e099696
parent 355165e804
@@ -1,4 +1,4 @@
-float32[] best_policy
-float32[] best_weights
-float32[] reward_mean
-float32[] reward_std
+float64[] best_policy
+float64[] best_weights
+float64[] reward_mean
+float64[] reward_std

@@ -1,4 +1,4 @@
 uint16 current_run
 uint16 current_episode
-float32 best_reward
-float32 last_user_reward
+float64 best_reward
+float64 last_user_reward

@@ -1,5 +1,5 @@
 string env
 uint32 seed
 bool final_run
-float32[] policy
-float32[] weights
+float64[] policy
+float64[] weights

@@ -1,3 +1,3 @@
-float32[] weights
+float64[] weights
 uint16 final_step
-float32 reward
+float64 reward

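The message-definition hunks above widen every floating-point field from float32 to float64. A minimal, ROS-independent sketch of the precision gap between the two types (plain NumPy, illustrative values only):

    import numpy as np

    # float32 carries roughly 7 significant decimal digits, float64 roughly 15-16,
    # so rewards and policy weights survive the message round-trip with less rounding.
    print(f"{float(np.float32(0.1)):.17f}")  # 0.10000000149011612
    print(f"{float(np.float64(0.1)):.17f}")  # 0.10000000000000001
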
@@ -51,7 +51,7 @@ class BayesianOptimization:
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))

             env_reward += output[1]
             done = output[2]

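The clip-cast-step pattern changed above can be sketched as a small standalone rollout helper. The 5-tuple step return (obs, reward, terminated, truncated, info) and the float64 action dtype are assumptions matching the modified environment, and run_policy is an illustrative name, not the project's API:

    import numpy as np

    def run_policy(env, policy, seed=None):
        """Open-loop rollout: one precomputed action per step, clipped to [-1, 1]."""
        env.reset(seed=seed)
        total_reward, steps = 0.0, 0
        for action in policy:
            clipped = np.clip(action, -1.0, 1.0).astype(np.float64)
            obs, reward, terminated, truncated, info = env.step(clipped)
            total_reward += reward
            steps += 1
            if terminated or truncated:
                break
        return total_reward, steps
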
@@ -71,8 +71,8 @@ class BayesianOptimization:
         self.env.reset(seed=seed)
         self.reset_bo()

-        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float32)
-        self.Y = np.zeros((self.nr_init, 1), dtype=np.float32)
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float64)
+        self.Y = np.zeros((self.nr_init, 1), dtype=np.float64)

         for i in range(self.nr_init):
             self.policy_model.random_policy()

@@ -122,34 +122,34 @@ class BayesianOptimization:
         return x_next

     def eval_new_observation(self, x_next, seed=None):
-        self.policy_model.weights = x_next
+        self.policy_model.weights = np.around(x_next, decimals=8)
         policy = self.policy_model.rollout()

         reward, step_count = self.runner(policy, seed=seed)

-        self.X = np.vstack((self.X, x_next), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_next, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)

         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1
         return step_count

     def add_new_observation(self, reward, x_new):
-        self.X = np.vstack((self.X, x_new), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_new, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)

         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1

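The vstack calls above use the dtype keyword that np.vstack gained in NumPy 1.24. A self-contained sketch of growing the GP training set in float64 with inputs rounded to 8 decimals (array shapes and values are illustrative, not taken from the project):

    import numpy as np

    X = np.zeros((0, 5), dtype=np.float64)  # 5 policy weights, no observations yet
    Y = np.zeros((0, 1), dtype=np.float64)

    x_next = np.around(np.random.default_rng(0).uniform(-1.0, 1.0, 5), decimals=8)
    reward = 42.0  # placeholder episode return

    X = np.vstack((X, x_next), dtype=np.float64)  # dtype kwarg requires NumPy >= 1.24
    Y = np.vstack((Y, reward), dtype=np.float64)
    print(X.shape, Y.shape, X.dtype)  # (1, 5) (1, 1) float64
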
@@ -24,7 +24,7 @@ class GaussianRBF:
         self.policy = np.zeros((self.nr_steps, 1))

     def random_policy(self):
-        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)
+        self.weights = np.around(self.rng.uniform(self.low, self.upper, self.nr_weights), decimals=8)

     def rollout(self):
         self.policy = np.zeros((self.nr_steps, 1))

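A standalone sketch of the rounded random-weight draw above, with a Generator-based RNG and illustrative bounds standing in for the class's low, upper, and nr_weights attributes:

    import numpy as np

    rng = np.random.default_rng(seed=0)
    low, upper, nr_weights = -1.0, 1.0, 6  # illustrative values
    weights = np.around(rng.uniform(low, upper, nr_weights), decimals=8)
    print(weights.dtype, weights)  # float64, every entry rounded to 8 decimal places
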
@@ -101,15 +101,15 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         high = np.array(
             [
                 self.x_threshold * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
                 self.theta_threshold_radians * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
             ],
-            dtype=np.float32,
+            dtype=np.float64,
         )

-        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
-        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
+        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
+        self.observation_space = spaces.Box(-high, high, dtype=np.float64)

         self.render_mode = render_mode

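Declaring float64 spaces as in the hunk above can be sketched with gymnasium.spaces.Box (whether the project imports gym or gymnasium is not visible here; the thresholds below are the classic CartPole limits, used only for illustration):

    import numpy as np
    from gymnasium import spaces

    x_threshold = 2.4
    theta_threshold_radians = 12 * 2 * np.pi / 360
    high = np.array(
        [x_threshold * 2, np.finfo(np.float64).max,
         theta_threshold_radians * 2, np.finfo(np.float64).max],
        dtype=np.float64,
    )
    action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
    observation_space = spaces.Box(-high, high, dtype=np.float64)
    print(action_space.dtype, observation_space.dtype)  # float64 float64
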
@@ -185,7 +185,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         if self.render_mode == "human":
             self.render()

-        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+        return np.array(self.state, dtype=np.float64), reward, terminated, False, {}

     def reset(
         self,

@@ -204,7 +204,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):

         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), {}
+        return np.array(self.state, dtype=np.float64), {}

     def render(self):
         if self.render_mode is None:

@@ -329,13 +329,13 @@ class ActiveBOTopic(Node):
         if self.BO is not None and self.BO.Y is not None:
             self.best_reward = np.max(self.BO.Y)

-        state_msg = ActiveBOState()
-        state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
-        state_msg.current_episode = self.current_episode + 1 \
-            if self.current_episode < self.bo_episodes else self.bo_episodes
-        state_msg.best_reward = float(self.best_reward)
-        state_msg.last_user_reward = float(self.rl_reward)
-        self.state_pub.publish(state_msg)
+        state_msg = ActiveBOState()
+        state_msg.current_run = self.current_run + 1 if self.current_run < self.bo_runs else self.bo_runs
+        state_msg.current_episode = self.current_episode + 1 \
+            if self.current_episode < self.bo_episodes else self.bo_episodes
+        state_msg.best_reward = float(self.best_reward)
+        state_msg.last_user_reward = float(self.rl_reward)
+        self.state_pub.publish(state_msg)


 def main(args=None):

@@ -86,7 +86,7 @@ class ActiveRLService(Node):
     def active_rl_callback(self, msg):
        self.rl_env = msg.env
         self.rl_seed = msg.seed
-        self.rl_policy = np.array(msg.policy, dtype=np.float32)
+        self.rl_policy = np.array(msg.policy, dtype=np.float64)
         self.rl_weights = msg.weights
         self.final_run = msg.final_run

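In rclpy, a float64[] message field typically arrives as a Python array('d'), so the np.array(..., dtype=np.float64) conversion above is loss-free. A small stand-in sketch without ROS (values are illustrative):

    import numpy as np
    from array import array

    msg_policy = array('d', [0.25, -1.3, 0.8])  # stand-in for msg.policy (float64[])
    policy = np.array(msg_policy, dtype=np.float64)  # no narrowing, unlike float32
    print(policy.dtype)  # float64
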
@@ -112,7 +112,7 @@ class ActiveRLService(Node):
         self.eval_weights = None

     def active_rl_eval_callback(self, msg):
-        self.eval_policy = np.array(msg.policy, dtype=np.float32)
+        self.eval_policy = np.array(msg.policy, dtype=np.float64)
         self.eval_weights = msg.weights

         self.get_logger().info('Active RL Eval: Responded!')

@@ -122,7 +122,7 @@ class ActiveRLService(Node):
     def next_image(self, policy):
         action = policy[self.rl_step]
         action_clipped = action.clip(min=-1.0, max=1.0)
-        output = self.env.step(action_clipped.astype(np.float32))
+        output = self.env.step(action_clipped.astype(np.float64))

         self.rl_reward += output[1]
         done = output[2]

@@ -27,7 +27,7 @@ class RLService(Node):

         reward = 0
         step_count = 0
-        policy = np.array(request.policy, dtype=np.float32)
+        policy = np.array(request.policy, dtype=np.float64)
         rl_env = request.env

         if rl_env == "Mountain Car":

@@ -46,7 +46,7 @@ class RLService(Node):
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))

             reward += output[1]
             done = output[2]