in run_offline.py
def train_agent(env, state_dim, action_dim, max_action, device, output_dir, args):
    # Load buffer
    dataset = d4rl.qlearning_dataset(env)
    data_sampler = Data_Sampler(dataset, device, args.reward_tune)
    utils.print_banner('Loaded buffer')
    # Select the agent implementation based on args.algo
    if args.algo == 'bc':
        from agents.bc_diffusion import BC as Agent
        agent = Agent(state_dim=state_dim,
                      action_dim=action_dim,
                      max_action=max_action,
                      device=device,
                      discount=args.discount,
                      tau=args.tau,
                      beta_schedule=args.beta_schedule,
                      n_timesteps=args.T,
                      model_type=args.model,
                      lr=args.lr)
    elif args.algo == 'pcq':
        from agents.ql_diffusion import PCQ as Agent
        agent = Agent(state_dim=state_dim,
                      action_dim=action_dim,
                      max_action=max_action,
                      device=device,
                      discount=args.discount,
                      tau=args.tau,
                      max_q_backup=args.max_q_backup,
                      beta_schedule=args.beta_schedule,
                      n_timesteps=args.T,
                      model_type=args.model,
                      eta=args.eta,
                      lr=args.lr,
                      lr_decay=args.lr_decay,
                      lr_maxt=args.num_epochs * args.num_steps_per_epoch,
                      mode=args.mode)
    elif args.algo == 'ql_cvae':
        from agents.ql_cvae import QL_CVAE as Agent
        agent = Agent(state_dim=state_dim,
                      action_dim=action_dim,
                      max_action=max_action,
                      device=device,
                      discount=args.discount,
                      tau=args.tau,
                      max_q_backup=args.max_q_backup,
                      eta=args.eta,
                      lr=args.lr)
    elif args.algo == 'ed_pcq':
        from agents.ed_pcq import ED_PCQ as Agent
        # Note: ensemble size and Q-loss weight are hard-coded for this agent
        agent = Agent(state_dim=state_dim,
                      action_dim=action_dim,
                      max_action=max_action,
                      device=device,
                      discount=args.discount,
                      tau=args.tau,
                      max_q_backup=args.max_q_backup,
                      beta_schedule=args.beta_schedule,
                      n_timesteps=args.T,
                      model_type=args.model,
                      eta=args.eta,
                      lr=args.lr,
                      num_qs=20,
                      q_eta=1.0)
    elif args.algo == 'adw_bc':
        from agents.adw_bc_diffusion import ADW_BC as Agent
        agent = Agent(state_dim=state_dim,
                      action_dim=action_dim,
                      max_action=max_action,
                      device=device,
                      discount=args.discount,
                      tau=args.tau,
                      beta_schedule=args.beta_schedule,
                      n_timesteps=args.T,
                      model_type=args.model,
                      quantile=args.quantile,
                      temp=args.temp,
                      lr=args.lr)
    elif args.algo == 'qgdp':
        from agents.qgdp import QGDP as Agent
        agent = Agent(state_dim=state_dim,
                      action_dim=action_dim,
                      max_action=max_action,
                      device=device,
                      discount=args.discount,
                      tau=args.tau,
                      beta_schedule=args.beta_schedule,
                      n_timesteps=args.T,
                      model_type=args.model,
                      quantile=args.quantile)
    else:
        raise NotImplementedError(f"Unknown algo: {args.algo}")
    evaluations = []
    training_iters = 0
    max_timesteps = args.num_epochs * args.num_steps_per_epoch
    best_score = -100.

    while training_iters < max_timesteps:
        # Train for eval_freq epochs, then evaluate
        iterations = int(args.eval_freq * args.num_steps_per_epoch)
        utils.print_banner(f"Train step: {training_iters}", separator="*", num_star=90)
        agent.train(data_sampler,
                    iterations=iterations,
                    batch_size=args.batch_size)
        training_iters += iterations
        curr_epoch = int(training_iters // int(args.num_steps_per_epoch))
        # Evaluate the current policy and log raw and D4RL-normalized returns
        eval_res, eval_res_std, eval_norm_res, eval_norm_res_std = eval_policy(agent, args.env_name, args.seed,
                                                                               eval_episodes=args.eval_episodes)
        evaluations.append([eval_res, eval_res_std, eval_norm_res, eval_norm_res_std])
        np.save(os.path.join(output_dir, "eval"), evaluations)

        # Record and save the best model (by normalized score)
        if eval_norm_res >= best_score:
            if args.save_best_model:
                agent.save_model(output_dir)
            best_score = eval_norm_res
            best_res = {'epoch': curr_epoch, 'best normalized score avg': eval_norm_res,
                        'best normalized score std': eval_norm_res_std,
                        'best raw score avg': eval_res, 'best raw score std': eval_res_std}
            with open(os.path.join(output_dir, "best_score.txt"), 'w') as f:
                f.write(json.dumps(best_res))
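
For context, a minimal sketch of how train_agent could be invoked follows. The actual run_offline.py defines its own argparse flags and defaults, so the environment name, hyperparameter values, and output path below are purely illustrative assumptions, not the repository's settings.

# Hypothetical driver sketch -- all values are illustrative, not the repo's defaults.
import gym
import d4rl  # importing d4rl registers the offline environments with gym
import torch
from argparse import Namespace

env_name = 'halfcheetah-medium-v2'
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fields required by the 'bc' branch plus the training/evaluation loop above.
args = Namespace(
    algo='bc', env_name=env_name, seed=0, reward_tune='no',
    discount=0.99, tau=0.005, beta_schedule='linear', T=5,
    model='MLP', lr=3e-4,
    num_epochs=1000, num_steps_per_epoch=1000, eval_freq=50,
    batch_size=256, eval_episodes=10, save_best_model=True)

train_agent(env, state_dim, action_dim, max_action, device, 'results/bc-halfcheetah', args)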