in run_offline.py [0:0]
def eval_policy(policy, env_name, seed, eval_episodes=10):
    """Evaluate `policy` for `eval_episodes` rollouts in a fresh environment.

    Args:
        policy: object exposing ``sample_action(state) -> action``.
        env_name: gym environment id. Assumed to be a D4RL-wrapped env,
            since the env must expose ``get_normalized_score`` — TODO confirm.
        seed: base seed; the eval env is seeded with ``seed + 100`` so its
            random stream never overlaps the training env's.
        eval_episodes: number of evaluation rollouts to average over.

    Returns:
        Tuple ``(avg_reward, std_reward, avg_norm_score, std_norm_score)``
        of raw and normalized return statistics across the rollouts.
    """
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)  # pre-0.26 gym seeding API (matches step() 4-tuple below)
    scores = []
    try:
        for _ in range(eval_episodes):
            traj_return = 0.
            state, done = eval_env.reset(), False
            while not done:
                action = policy.sample_action(np.array(state))
                # old gym API: step returns (obs, reward, done, info)
                state, reward, done, _ = eval_env.step(action)
                traj_return += reward
            scores.append(traj_return)
        avg_reward = np.mean(scores)
        std_reward = np.std(scores)
        # Normalized scores (presumably D4RL convention: 0 = random, 100 = expert).
        # get_normalized_score is affine, so normalizing the mean equals the
        # mean of the normalized per-episode scores.
        normalized_scores = [eval_env.get_normalized_score(s) for s in scores]
        avg_norm_score = eval_env.get_normalized_score(avg_reward)
        std_norm_score = np.std(normalized_scores)
    finally:
        # Fix: the env was never closed, leaking simulator/viewer resources
        # when eval_policy is called repeatedly during training.
        eval_env.close()
    utils.print_banner(f"Evaluation over {eval_episodes} episodes: {avg_reward:.2f} {avg_norm_score:.2f}")
    return avg_reward, std_reward, avg_norm_score, std_norm_score