agents/adw_bc_diffusion.py [109:124]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            with torch.no_grad():
                q1, q2 = self.critic_target(state, action)
                q = torch.min(q1, q2)  # Clipped Double Q-learning
            v = self.value_fun(state)
            value_loss = expectile_reg_loss(q - v, self.quantile).mean()
            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

            # Critic Training
            current_q1, current_q2 = self.critic(state, action)
            target_q = (reward + not_done * self.discount * self.value_fun(next_state)).detach()
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
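
The excerpt above (and the identical one from agents/qgdp.py below) calls expectile_reg_loss, which is defined outside the quoted ranges. A minimal sketch of that helper, assuming the standard IQL-style expectile (asymmetric L2) loss; the actual name and signature in this repo may differ:

    import torch

    def expectile_reg_loss(diff: torch.Tensor, quantile: float) -> torch.Tensor:
        # Asymmetric L2: residuals with q > v are weighted by `quantile`,
        # residuals with q < v by (1 - quantile). With quantile > 0.5 this
        # pushes V(s) toward an upper expectile of min(Q1, Q2), approximating
        # a max over in-distribution actions; quantile = 0.5 recovers plain MSE.
        weight = torch.abs(quantile - (diff < 0).float())
        return weight * diff.pow(2)  # per-element losses; the caller takes .mean()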



agents/qgdp.py [124:139]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            with torch.no_grad():
                q1, q2 = self.critic_target(state, action)
                q = torch.min(q1, q2)  # Clipped Double Q-learning
            v = self.value_fun(state)
            value_loss = expectile_reg_loss(q - v, self.quantile).mean()
            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

            # Critic Training
            current_q1, current_q2 = self.critic(state, action)
            target_q = (reward + not_done * self.discount * self.value_fun(next_state)).detach()
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
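
In both files the value step regresses V(s) toward min(Q1, Q2) from self.critic_target under the expectile loss, and the critic step then regresses Q(s, a) toward reward + not_done * discount * V(next_state), detached so gradients flow only into the online critic. How self.critic_target itself is refreshed falls outside the quoted ranges; a common choice is Polyak (soft) averaging after each critic step. A sketch under that assumption, with a hypothetical soft_update helper and mixing coefficient tau:

    import torch

    def soft_update(critic: torch.nn.Module, critic_target: torch.nn.Module, tau: float = 0.005) -> None:
        # Polyak averaging: target <- (1 - tau) * target + tau * online.
        # Hypothetical helper; the update mechanism and coefficient actually
        # used by these agents may differ.
        with torch.no_grad():
            for p, p_t in zip(critic.parameters(), critic_target.parameters()):
                p_t.mul_(1.0 - tau)
                p_t.add_(tau * p)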



