# Service definition: the request carries run-configuration parameters and the
# response carries float32 result arrays.
# NOTE(review): the per-field meanings below are inferred from the field names
# only; confirm each against the node that implements this service.

# Request
# Number of weights (presumably the length of the weight vector; TODO confirm).
uint16 nr_weights
# Maximum number of steps (presumably per episode; TODO confirm).
uint16 max_steps
# Number of episodes (presumably per run; TODO confirm).
uint16 nr_episodes
# Number of runs (presumably independent repetitions; TODO confirm).
uint16 nr_runs
# Name of the acquisition function to use; the accepted values are not defined
# in this file, so verify them against the service server.
string acquisition_function
# Epsilon parameter; its exact role (e.g. exploration rate or tolerance) is not
# visible here. TODO confirm.
float32 epsilon
---
# Response
# Best policy found (layout and length of the array unknown from here; TODO confirm).
float32[] best_policy
# Best weights found (presumably nr_weights entries; TODO confirm).
float32[] best_weights
# Mean reward values (presumably one entry per episode or run; TODO confirm).
float32[] reward_mean
# Reward standard-deviation values (presumably aligned with reward_mean; TODO confirm).
float32[] reward_std