Module ilpyt.agents.a2c_agent

An implementation of an Advantage Actor-Critic (A2C) agent. A2C is the synchronous variant of the asynchronous advantage actor-critic (A3C) algorithm described in the paper "Asynchronous Methods for Deep Reinforcement Learning" by Volodymyr Mnih, Adrià Puigdomènech Badia, Mehdi Mirza, Alex Graves, Timothy P. Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu, presented at ICML 2016.

For more details, please refer to the paper: https://arxiv.org/abs/1602.01783
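
A minimal usage sketch is given below. The SimpleActor and SimpleCritic classes are hypothetical stand-ins for project-specific BaseNetwork subclasses; the get_action signature (returning a distribution and a sampled action) is inferred from the agent source further down, and the default CPU device noted under the class description is assumed.

import torch
import torch.nn as nn
from torch.distributions import Categorical

from ilpyt.agents.a2c_agent import A2CAgent


class SimpleActor(nn.Module):
    # Hypothetical stand-in for a BaseNetwork policy subclass.
    def __init__(self, state_dim: int, num_actions: int):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, num_actions)
        )

    def get_action(self, state: torch.Tensor):
        # Return the action distribution and a sampled action, mirroring
        # the (dist, actions) pair the agent expects.
        dist = Categorical(logits=self.body(state))
        return dist, dist.sample()


class SimpleCritic(nn.Module):
    # Hypothetical stand-in for a BaseNetwork value subclass.
    def __init__(self, state_dim: int):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, 1)
        )

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        return self.body(state)


agent = A2CAgent(actor=SimpleActor(4, 2), critic=SimpleCritic(4), lr=1e-4)
states = torch.zeros(8, 4)      # (batch_size, state_shape)
actions = agent.step(states)    # np.ndarray with one action per state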

Expand source code
"""
An implementation of an Advantage Actor-Critic (A2C) agent. A2C is the
synchronous variant of the asynchronous advantage actor-critic (A3C)
algorithm described in the paper "Asynchronous Methods for Deep
Reinforcement Learning" by Volodymyr Mnih, Adrià Puigdomènech Badia,
Mehdi Mirza, Alex Graves, Timothy P. Lillicrap, Tim Harley, David
Silver, and Koray Kavukcuoglu, presented at ICML 2016.

For more details, please refer to the paper: https://arxiv.org/abs/1602.01783
"""

from typing import Dict, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import Adam

from ilpyt.agents.base_agent import BaseAgent
from ilpyt.nets.base_net import BaseNetwork
from ilpyt.utils.agent_utils import compute_target, flatten_batch

__pdoc__ = {"__init__": False}


class A2CAgent(BaseAgent):
    def initialize(
        self,
        actor: Union[BaseNetwork, None] = None,
        critic: Union[BaseNetwork, None] = None,
        lr: float = 0.0001,
        gamma: float = 0.99,
        entropy_coeff: float = 0.01,
    ) -> None:
        """
        Initialization function for the A2C agent.

        Parameters
        ----------
        actor: BaseNetwork, default=None
            actor network
        critic: BaseNetwork, default=None
            critic network
        lr: float, default=0.0001
            learning rate for training
        gamma: float, default=0.99
            discount factor used to compute targets
        entropy_coeff: float, default=0.01
            weight factor for entropy loss term

        Raises
        ------
        ValueError:
            if `actor` or `critic` are not specified
        """
        self.gamma = gamma
        self.entropy_coeff = entropy_coeff
        self.lr = lr

        # Networks
        if actor is None or critic is None:
            raise ValueError(
                'Please provide BaseNetwork for actor and critic. Currently set to None.'
            )
        self.actor = actor
        self.critic = critic
        self.nets = {'actor': self.actor, 'critic': self.critic}

        self.opt_actor = Adam(self.actor.parameters(), self.lr)
        self.opt_critic = Adam(self.critic.parameters(), self.lr)

    @torch.no_grad()
    def reset(self) -> None:
        """
        Reset the network weights and optimizers.
        """
        # Reset actor weights
        for layers in self.actor.children():
            for layer in layers:
                if hasattr(layer, 'reset_parameters'):
                    layer.reset_parameters()

        # Reset critic weights
        for layers in self.critic.children():
            for layer in layers:
                if hasattr(layer, 'reset_parameters'):
                    layer.reset_parameters()

        # Reset optimizers
        self.opt_actor = Adam(self.actor.parameters(), self.lr)
        self.opt_critic = Adam(self.critic.parameters(), self.lr)

    @torch.no_grad()
    def step(self, state: torch.Tensor) -> np.ndarray:
        """
        Find best action for the given state according to the current policy.

        Parameters
        ----------
        state: torch.Tensor
            state tensor, of size (batch_size, state_shape)

        Returns
        -------
        np.ndarray:
            selected actions, of size (batch_size, action_shape)
        """
        _, actions = self.actor.get_action(state)

        if self.device == 'gpu':
            actions = actions.cpu().numpy()
        else:
            actions = actions.numpy()
        return actions

    def update(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]:
        """
        Update agent policy based on batch of experiences.

        Parameters
        ----------
        batch: Dict[str, torch.Tensor]
            batch of transitions, with keys `states`, `actions`, `rewards`, 
            `dones`, and `next_states`. Values should be of size (num_steps, 
            num_env, item_shape)

        Returns
        -------
        Dict[str, float]:
            losses for the update step; the key strings and loss values
            can be automatically recorded to TensorBoard
        """
        # Compute values used for losses
        final_states = batch['next_states'][-1]
        value_final = self.critic(final_states).squeeze()
        targets = compute_target(
            value_final, batch['rewards'], 1 - batch['dones'], self.gamma
        ).reshape(-1)
        if self.device == 'gpu':
            targets = targets.cuda()

        batch = flatten_batch(batch)
        values = self.critic(batch['states']).squeeze()
        advantages = targets - values

        dist, _ = self.actor.get_action(batch['states'])
        actions = batch['actions']
        log_action_probs = dist.log_prob(actions)
        if len(log_action_probs.shape) > 1:
            log_action_probs = log_action_probs.sum(axis=-1)

        # Compute losses
        loss_action = -(log_action_probs * advantages.detach()).mean()
        loss_entropy = self.entropy_coeff * dist.entropy().mean()
        loss_actor = loss_action - loss_entropy
        loss_critic = F.smooth_l1_loss(values, targets)

        # Updates
        self.opt_actor.zero_grad()
        self.opt_critic.zero_grad()
        loss_actor.backward()
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 5)
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 5)
        self.opt_actor.step()
        self.opt_critic.step()

        # Return loss dictionary
        loss_dict = {
            'loss/actor': loss_actor.item(),
            'loss/critic': loss_critic.item(),
            'loss/total': loss_actor.item() + loss_critic.item(),
        }
        return loss_dict

Classes

class A2CAgent (**kwargs)

By default, the agent will be in train mode and configured to use the CPU for step and update calls.

Parameters

**kwargs: arbitrary keyword arguments that will be passed to the initialize function
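
For illustration, every keyword argument given at construction is forwarded to initialize. In the sketch below, my_actor and my_critic stand for already-built BaseNetwork instances (see the module-level sketch above):

from ilpyt.agents.a2c_agent import A2CAgent

agent = A2CAgent(
    actor=my_actor,        # BaseNetwork policy network
    critic=my_critic,      # BaseNetwork value network
    lr=1e-4,               # learning rate for both Adam optimizers
    gamma=0.99,            # discount factor for the bootstrapped targets
    entropy_coeff=0.01,    # weight of the entropy bonus in the actor loss
)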

Expand source code
class A2CAgent(BaseAgent):
    def initialize(
        self,
        actor: Union[BaseNetwork, None] = None,
        critic: Union[BaseNetwork, None] = None,
        lr: float = 0.0001,
        gamma: float = 0.99,
        entropy_coeff: float = 0.01,
    ) -> None:
        """
        Initialization function for the A2C agent.

        Parameters
        ----------
        actor: BaseNetwork, default=None
            actor network
        critic: BaseNetwork, default=None
            critic network
        lr: float, default=0.0001
            learning rate for training
        gamma: float, default=0.99
            discount factor used to compute targets
        entropy_coeff: float, default=0.01
            weight factor for entropy loss term

        Raises
        ------
        ValueError:
            if `actor` or `critic` are not specified
        """
        self.gamma = gamma
        self.entropy_coeff = entropy_coeff
        self.lr = lr

        # Networks
        if actor is None or critic is None:
            raise ValueError(
                'Please provide BaseNetwork for actor and critic. Currently set to None.'
            )
        self.actor = actor
        self.critic = critic
        self.nets = {'actor': self.actor, 'critic': self.critic}

        self.opt_actor = Adam(self.actor.parameters(), self.lr)
        self.opt_critic = Adam(self.critic.parameters(), self.lr)

    @torch.no_grad()
    def reset(self) -> None:
        """
        Reset the network weights and optimizers.
        """
        # Reset actor weights
        for layers in self.actor.children():
            for layer in layers:
                if hasattr(layer, 'reset_parameters'):
                    layer.reset_parameters()

        # Reset critic weights
        for layers in self.critic.children():
            for layer in layers:
                if hasattr(layer, 'reset_parameters'):
                    layer.reset_parameters()

        # Reset optimizers
        self.opt_actor = Adam(self.actor.parameters(), self.lr)
        self.opt_critic = Adam(self.critic.parameters(), self.lr)

    @torch.no_grad()
    def step(self, state: torch.Tensor) -> np.ndarray:
        """
        Find best action for the given state according to the current policy.

        Parameters
        ----------
        state: torch.Tensor
            state tensor, of size (batch_size, state_shape)

        Returns
        -------
        np.ndarray:
            selected actions, of size (batch_size, action_shape)
        """
        _, actions = self.actor.get_action(state)

        if self.device == 'gpu':
            actions = actions.cpu().numpy()
        else:
            actions = actions.numpy()
        return actions

    def update(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]:
        """
        Update agent policy based on batch of experiences.

        Parameters
        ----------
        batch: Dict[str, torch.Tensor]
            batch of transitions, with keys `states`, `actions`, `rewards`, 
            `dones`, and `next_states`. Values should be of size (num_steps, 
            num_env, item_shape)

        Returns
        -------
        Dict[str, float]:
            losses for the update step; the key strings and loss values
            can be automatically recorded to TensorBoard
        """
        # Compute values used for losses
        final_states = batch['next_states'][-1]
        value_final = self.critic(final_states).squeeze()
        targets = compute_target(
            value_final, batch['rewards'], 1 - batch['dones'], self.gamma
        ).reshape(-1)
        if self.device == 'gpu':
            targets = targets.cuda()

        batch = flatten_batch(batch)
        values = self.critic(batch['states']).squeeze()
        advantages = targets - values

        dist, _ = self.actor.get_action(batch['states'])
        actions = batch['actions']
        log_action_probs = dist.log_prob(actions)
        if len(log_action_probs.shape) > 1:
            log_action_probs = log_action_probs.sum(axis=-1)

        # Compute losses
        loss_action = -(log_action_probs * advantages.detach()).mean()
        loss_entropy = self.entropy_coeff * dist.entropy().mean()
        loss_actor = loss_action - loss_entropy
        loss_critic = F.smooth_l1_loss(values, targets)

        # Updates
        self.opt_actor.zero_grad()
        self.opt_critic.zero_grad()
        loss_actor.backward()
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 5)
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 5)
        self.opt_actor.step()
        self.opt_critic.step()

        # Return loss dictionary
        loss_dict = {
            'loss/actor': loss_actor.item(),
            'loss/critic': loss_critic.item(),
            'loss/total': loss_actor.item() + loss_critic.item(),
        }
        return loss_dict

Ancestors

ilpyt.agents.base_agent.BaseAgent

Methods

def initialize(self, actor: Union[BaseNetwork, NoneType] = None, critic: Union[BaseNetwork, NoneType] = None, lr: float = 0.0001, gamma: float = 0.99, entropy_coeff: float = 0.01) ‑> NoneType

Initialization function for the A2C agent.

Parameters

actor : BaseNetwork, default=None
actor network
critic : BaseNetwork, default=None
critic network
lr : float, default=0.0001
learning rate for training
gamma : float, default=0.99
discount factor used to compute targets
entropy_coeff : float, default=0.01
weight factor for entropy loss term

Raises

ValueError

if actor or critic are not specified
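
A brief sketch of the error path (my_actor stands for any BaseNetwork instance; the critic is deliberately left unset, which is assumed to trigger the check during construction):

from ilpyt.agents.a2c_agent import A2CAgent

try:
    agent = A2CAgent(actor=my_actor)  # no critic supplied
except ValueError as err:
    print(err)  # asks for BaseNetwork instances for both actor and critic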

Expand source code
def initialize(
    self,
    actor: Union[BaseNetwork, None] = None,
    critic: Union[BaseNetwork, None] = None,
    lr: float = 0.0001,
    gamma: float = 0.99,
    entropy_coeff: float = 0.01,
) -> None:
    """
    Initialization function for the A2C agent.

    Parameters
    ----------
    actor: BaseNetwork, default=None
        actor network
    critic: BaseNetwork, default=None
        critic network
    lr: float, default=0.0001
        learning rate for training
    gamma: float, default=0.99
        discount factor used to compute targets
    entropy_coeff: float, default=0.01
        weight factor for entropy loss term

    Raises
    ------
    ValueError:
        if `actor` or `critic` are not specified
    """
    self.gamma = gamma
    self.entropy_coeff = entropy_coeff
    self.lr = lr

    # Networks
    if actor is None or critic is None:
        raise ValueError(
            'Please provide BaseNetwork for actor and critic. Currently set to None.'
        )
    self.actor = actor
    self.critic = critic
    self.nets = {'actor': self.actor, 'critic': self.critic}

    self.opt_actor = Adam(self.actor.parameters(), self.lr)
    self.opt_critic = Adam(self.critic.parameters(), self.lr)
def reset(self) ‑> NoneType

Reset the network weights and optimizers.

Expand source code
@torch.no_grad()
def reset(self) -> None:
    """
    Reset the network weights and optimizers.
    """
    # Reset actor weights
    for layers in self.actor.children():
        for layer in layers:
            if hasattr(layer, 'reset_parameters'):
                layer.reset_parameters()

    # Reset critic weights
    for layers in self.critic.children():
        for layer in layers:
            if hasattr(layer, 'reset_parameters'):
                layer.reset_parameters()

    # Reset optimizers
    self.opt_actor = Adam(self.actor.parameters(), self.lr)
    self.opt_critic = Adam(self.critic.parameters(), self.lr)
def step(self, state: torch.Tensor) ‑> numpy.ndarray

Find best action for the given state according to the current policy.

Parameters

state : torch.Tensor
state tensor, of size (batch_size, state_shape)

Returns

np.ndarray:
selected actions, of size (batch_size, action_shape)
Expand source code
@torch.no_grad()
def step(self, state: torch.Tensor) -> np.ndarray:
    """
    Find best action for the given state according to the current policy.

    Parameters
    ----------
    state: torch.Tensor
        state tensor, of size (batch_size, state_shape)

    Returns
    -------
    np.ndarray:
        selected actions, of size (batch_size, action_shape)
    """
    _, actions = self.actor.get_action(state)

    if self.device == 'gpu':
        actions = actions.cpu().numpy()
    else:
        actions = actions.numpy()
    return actions
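
Because step takes a torch.Tensor and returns a NumPy array, raw environment observations typically need a conversion first. A small sketch, reusing the agent built in the earlier examples and a hypothetical batch of vectorized-environment observations:

import numpy as np
import torch

obs = np.zeros((16, 4), dtype=np.float32)      # (num_env, state_shape) observations
actions = agent.step(torch.from_numpy(obs))    # np.ndarray, one action per environment
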
def update(self, batch: Dict[str, torch.Tensor]) ‑> Dict[str, float]

Update agent policy based on batch of experiences.

Parameters

batch : Dict[str, torch.Tensor]
batch of transitions, with keys states, actions, rewards, dones, and next_states. Values should be of size (num_steps, num_env, item_shape)

Returns

Dict[str, float]:
losses for the update step; the key strings and loss values can be automatically recorded to TensorBoard
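
The actor loss below is a policy-gradient term minus an entropy bonus, and the critic loss is a smooth L1 regression of predicted values onto bootstrapped targets. As a rough sketch of what compute_target is assumed to produce (an n-step discounted return seeded with the critic's value of the final next-states; the actual helper lives in ilpyt.utils.agent_utils and may differ in detail):

import torch

def n_step_targets(value_final, rewards, not_dones, gamma):
    # Hypothetical reimplementation for illustration only:
    # targets[t] = rewards[t] + gamma * not_dones[t] * targets[t + 1],
    # bootstrapped from the critic's estimate for the final next-states.
    targets = torch.zeros_like(rewards)
    running = value_final
    for t in reversed(range(rewards.shape[0])):
        running = rewards[t] + gamma * not_dones[t] * running
        targets[t] = running
    return targets
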
Expand source code
def update(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]:
    """
    Update agent policy based on batch of experiences.

    Parameters
    ----------
    batch: Dict[str, torch.Tensor]
        batch of transitions, with keys `states`, `actions`, `rewards`, 
        `dones`, and `next_states`. Values should be of size (num_steps, 
        num_env, item_shape)

    Returns
    -------
    Dict[str, float]:
        losses for the update step; the key strings and loss values
        can be automatically recorded to TensorBoard
    """
    # Compute values used for losses
    final_states = batch['next_states'][-1]
    value_final = self.critic(final_states).squeeze()
    targets = compute_target(
        value_final, batch['rewards'], 1 - batch['dones'], self.gamma
    ).reshape(-1)
    if self.device == 'gpu':
        targets = targets.cuda()

    batch = flatten_batch(batch)
    values = self.critic(batch['states']).squeeze()
    advantages = targets - values

    dist, _ = self.actor.get_action(batch['states'])
    actions = batch['actions']
    log_action_probs = dist.log_prob(actions)
    if len(log_action_probs.shape) > 1:
        log_action_probs = log_action_probs.sum(axis=-1)

    # Compute losses
    loss_action = -(log_action_probs * advantages.detach()).mean()
    loss_entropy = self.entropy_coeff * dist.entropy().mean()
    loss_actor = loss_action - loss_entropy
    loss_critic = F.smooth_l1_loss(values, targets)

    # Updates
    self.opt_actor.zero_grad()
    self.opt_critic.zero_grad()
    loss_actor.backward()
    loss_critic.backward()
    torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 5)
    torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 5)
    self.opt_actor.step()
    self.opt_critic.step()

    # Return loss dictionary
    loss_dict = {
        'loss/actor': loss_actor.item(),
        'loss/critic': loss_critic.item(),
        'loss/total': loss_actor.item() + loss_critic.item(),
    }
    return loss_dict
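
A shape-level sketch of calling update, reusing the agent from the module-level example (a hypothetical batch of 5 steps from 8 parallel environments with 4-dimensional states and discrete actions; in practice the batch comes from ilpyt's rollout utilities):

import torch

num_steps, num_env, state_dim = 5, 8, 4
batch = {
    'states': torch.zeros(num_steps, num_env, state_dim),
    'actions': torch.zeros(num_steps, num_env, dtype=torch.long),
    'rewards': torch.zeros(num_steps, num_env),
    'dones': torch.zeros(num_steps, num_env),
    'next_states': torch.zeros(num_steps, num_env, state_dim),
}
losses = agent.update(batch)
# e.g. {'loss/actor': ..., 'loss/critic': ..., 'loss/total': ...}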

Inherited members