pre major rl eval change
parent 355165e804
commit 1b9e099696
@@ -1,4 +1,4 @@
-float32[] best_policy
-float32[] best_weights
-float32[] reward_mean
-float32[] reward_std
+float64[] best_policy
+float64[] best_weights
+float64[] reward_mean
+float64[] reward_std
@@ -1,4 +1,4 @@
 uint16 current_run
 uint16 current_episode
-float32 best_reward
-float32 last_user_reward
+float64 best_reward
+float64 last_user_reward
@@ -1,5 +1,5 @@
 string env
 uint32 seed
 bool final_run
-float32[] policy
-float32[] weights
+float64[] policy
+float64[] weights
@@ -1,3 +1,3 @@
-float32[] weights
+float64[] weights
 uint16 final_step
-float32 reward
+float64 reward
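
The four message hunks above widen every floating-point field from float32 to float64. A quick numpy round trip shows the practical difference for a policy vector that is produced in float64 on the Python side; this is a standalone sketch, not code from the commit.

# Standalone sketch: what a float32[] vs. float64[] message field does to a
# float64 policy vector on a publish/receive round trip.
import numpy as np

weights = np.array([0.12345678901234, -0.5, 1.0])        # float64 by default

via_f32 = weights.astype(np.float32).astype(np.float64)  # old float32[] field
via_f64 = weights.astype(np.float64)                      # new float64[] field

print(np.max(np.abs(weights - via_f32)))  # nonzero: float32 rounds the values
print(np.max(np.abs(weights - via_f64)))  # 0.0: values survive unchanged
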
@@ -51,7 +51,7 @@ class BayesianOptimization:
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))

             env_reward += output[1]
             done = output[2]
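
This hunk is the rollout loop: each policy entry is clipped to [-1, 1] and cast to the env's dtype before env.step, and the commit widens that cast to float64. A minimal standalone version of the same loop, run against a stock gymnasium environment rather than the repo's own env wiring (the zero policy and episode length are placeholders):

# Minimal sketch of the rollout loop, using a standard gymnasium env in place
# of the repo's own environment setup.
import gymnasium as gym
import numpy as np

env = gym.make("MountainCarContinuous-v0")
env.reset(seed=0)

policy = np.zeros((10, 1), dtype=np.float64)   # placeholder open-loop policy
env_reward = 0.0

for i in range(len(policy)):
    action = policy[i]
    action_clipped = action.clip(min=-1.0, max=1.0)
    # the commit widens this cast from float32 to float64
    output = env.step(action_clipped.astype(np.float64))
    env_reward += output[1]
    done = output[2] or output[3]              # terminated or truncated
    if done:
        break

env.close()
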
@@ -71,8 +71,8 @@ class BayesianOptimization:
         self.env.reset(seed=seed)
         self.reset_bo()

-        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float32)
-        self.Y = np.zeros((self.nr_init, 1), dtype=np.float32)
+        self.X = np.zeros((self.nr_init, self.nr_policy_weights), dtype=np.float64)
+        self.Y = np.zeros((self.nr_init, 1), dtype=np.float64)

         for i in range(self.nr_init):
             self.policy_model.random_policy()
@@ -122,34 +122,34 @@ class BayesianOptimization:
         return x_next

     def eval_new_observation(self, x_next, seed=None):
-        self.policy_model.weights = x_next
+        self.policy_model.weights = np.around(x_next, decimals=8)
         policy = self.policy_model.rollout()

         reward, step_count = self.runner(policy, seed=seed)

-        self.X = np.vstack((self.X, x_next), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_next, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)

         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1
         return step_count

     def add_new_observation(self, reward, x_new):
-        self.X = np.vstack((self.X, x_new), dtype=np.float32)
-        self.Y = np.vstack((self.Y, reward), dtype=np.float32)
+        self.X = np.vstack((self.X, np.around(x_new, decimals=8)), dtype=np.float64)
+        self.Y = np.vstack((self.Y, reward), dtype=np.float64)

         self.GP.fit(self.X, self.Y)

         if self.episode == 0:
             self.best_reward[0] = np.max(self.Y)
         else:
-            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float32)
+            self.best_reward = np.vstack((self.best_reward, np.max(self.Y)), dtype=np.float64)

         self.episode += 1

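
eval_new_observation and add_new_observation now round the candidate weights to 8 decimals and keep the GP training buffers in float64 before refitting. A condensed sketch of that bookkeeping, with scikit-learn's GaussianProcessRegressor standing in for self.GP (an assumption about the repo) and random placeholder values; the dtype argument of np.vstack needs NumPy >= 1.24, which the repo code already relies on.

# Condensed sketch of the X/Y bookkeeping above; GaussianProcessRegressor is
# a stand-in for self.GP and all values are placeholders.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.default_rng(0)
nr_weights = 5

X = np.around(rng.uniform(-1.0, 1.0, (3, nr_weights)), decimals=8)  # initial designs
Y = rng.uniform(-1.0, 0.0, (3, 1))                                  # their rewards
gp = GaussianProcessRegressor()

x_next = rng.uniform(-1.0, 1.0, nr_weights)   # acquisition proposal
reward = -0.5                                  # placeholder rollout return

X = np.vstack((X, np.around(x_next, decimals=8)), dtype=np.float64)
Y = np.vstack((Y, reward), dtype=np.float64)
gp.fit(X, Y)

best_reward = np.max(Y)
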
@@ -24,7 +24,7 @@ class GaussianRBF:
         self.policy = np.zeros((self.nr_steps, 1))

     def random_policy(self):
-        self.weights = self.rng.uniform(self.low, self.upper, self.nr_weights)
+        self.weights = np.around(self.rng.uniform(self.low, self.upper, self.nr_weights), decimals=8)

     def rollout(self):
         self.policy = np.zeros((self.nr_steps, 1))
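
random_policy now rounds the uniformly sampled weights to 8 decimal places as well. A tiny seeded example of that rounding (bounds and weight count are placeholders, not values from the repo):

# Tiny example of the rounding added in random_policy.
import numpy as np

rng = np.random.default_rng(seed=42)
low, upper, nr_weights = -1.0, 1.0, 6

weights = np.around(rng.uniform(low, upper, nr_weights), decimals=8)
print(weights)   # each entry carries at most 8 decimal places
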
@@ -101,15 +101,15 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         high = np.array(
             [
                 self.x_threshold * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
                 self.theta_threshold_radians * 2,
-                np.finfo(np.float32).max,
+                np.finfo(np.float64).max,
             ],
-            dtype=np.float32,
+            dtype=np.float64,
         )

-        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
-        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
+        self.action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
+        self.observation_space = spaces.Box(-high, high, dtype=np.float64)

         self.render_mode = render_mode

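
The forked CartPole now declares both spaces with dtype=np.float64. A standalone equivalent of those definitions, reusing the stock CartPole threshold values (assumed unchanged in this fork):

# Standalone sketch of the float64 space definitions; the threshold values
# are the stock CartPole defaults and assumed unchanged here.
import math
import numpy as np
from gymnasium.spaces import Box

x_threshold = 2.4
theta_threshold_radians = 12 * 2 * math.pi / 360

high = np.array(
    [
        x_threshold * 2,
        np.finfo(np.float64).max,
        theta_threshold_radians * 2,
        np.finfo(np.float64).max,
    ],
    dtype=np.float64,
)

action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
observation_space = Box(-high, high, dtype=np.float64)

print(action_space.dtype, observation_space.dtype)   # float64 float64
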
@@ -185,7 +185,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
         if self.render_mode == "human":
             self.render()

-        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+        return np.array(self.state, dtype=np.float64), reward, terminated, False, {}

     def reset(
         self,
@@ -204,7 +204,7 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):

         if self.render_mode == "human":
             self.render()
-        return np.array(self.state, dtype=np.float32), {}
+        return np.array(self.state, dtype=np.float64), {}

     def render(self):
         if self.render_mode is None:
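
Together with the space changes above, step() and reset() now hand back the state as float64, so the returned arrays keep the same dtype as the declared observation_space. A small check of that invariant, shown here on the stock CartPole-v1 (the forked float64 env is assumed to pass the same check):

# Check that reset()/step() return observations whose dtype matches the
# declared observation_space; demonstrated on the stock CartPole-v1.
import gymnasium as gym


def check_obs_dtype(env):
    obs, _ = env.reset(seed=0)
    assert obs.dtype == env.observation_space.dtype
    obs, *_ = env.step(env.action_space.sample())
    assert obs.dtype == env.observation_space.dtype


check_obs_dtype(gym.make("CartPole-v1"))
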
@@ -86,7 +86,7 @@ class ActiveRLService(Node):
     def active_rl_callback(self, msg):
         self.rl_env = msg.env
         self.rl_seed = msg.seed
-        self.rl_policy = np.array(msg.policy, dtype=np.float32)
+        self.rl_policy = np.array(msg.policy, dtype=np.float64)
         self.rl_weights = msg.weights
         self.final_run = msg.final_run

@@ -112,7 +112,7 @@ class ActiveRLService(Node):
         self.eval_weights = None

     def active_rl_eval_callback(self, msg):
-        self.eval_policy = np.array(msg.policy, dtype=np.float32)
+        self.eval_policy = np.array(msg.policy, dtype=np.float64)
         self.eval_weights = msg.weights

         self.get_logger().info('Active RL Eval: Responded!')
@@ -122,7 +122,7 @@ class ActiveRLService(Node):
     def next_image(self, policy):
         action = policy[self.rl_step]
         action_clipped = action.clip(min=-1.0, max=1.0)
-        output = self.env.step(action_clipped.astype(np.float32))
+        output = self.env.step(action_clipped.astype(np.float64))

         self.rl_reward += output[1]
         done = output[2]
@@ -27,7 +27,7 @@ class RLService(Node):

         reward = 0
         step_count = 0
-        policy = np.array(request.policy, dtype=np.float32)
+        policy = np.array(request.policy, dtype=np.float64)
         rl_env = request.env

         if rl_env == "Mountain Car":
@@ -46,7 +46,7 @@ class RLService(Node):
         for i in range(len(policy)):
             action = policy[i]
             action_clipped = action.clip(min=-1.0, max=1.0)
-            output = self.env.step(action_clipped.astype(np.float32))
+            output = self.env.step(action_clipped.astype(np.float64))

             reward += output[1]
             done = output[2]