Module ilpyt.runners.runner
The runner coordinates the agent-environment interaction loop.
It collects transitions (state, action, reward, next state) over specified
intervals of time. We can have the runner generate a collection of transitions
by calling `generate_batch` or `generate_episodes`.
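A minimal usage sketch (`build_vec_env` and `build_agent` are hypothetical
placeholders for however your project constructs a VecEnv and a BaseAgent;
they are not part of this module):

from ilpyt.runners.runner import Runner

env = build_vec_env('CartPole-v0', num_envs=4)   # placeholder: any VecEnv
agent = build_agent(env)                         # placeholder: any BaseAgent
runner = Runner(env=env, agent=agent, use_gpu=False)

batch = runner.generate_batch(rollout_steps=128)       # dict of torch Tensors
episodes = runner.generate_episodes(num_episodes=10)   # Experiences object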
Expand source code
"""
The runner coordinates the agent-environment interaction loop. It collects
transitions (state, action, reward, next state) over specified intervals of
time. We can have the runner generate a collection of transitions for us by
calling `generate_batch` and `generate_episodes`.
"""
from typing import Any, Dict, List
import numpy as np
import torch
from ilpyt.agents.base_agent import BaseAgent
from ilpyt.envs.vec_env import VecEnv
from ilpyt.utils.seed_utils import set_seed
class Experiences:
def __init__(self) -> None:
"""
Initialize experiences object, which stores a stack of agent-environment
transitions.
"""
self.states = []
self.actions = []
self.rewards = []
self.dones = []
def add(
self,
state: torch.Tensor,
action: torch.Tensor,
reward: torch.Tensor,
done: torch.Tensor,
) -> None:
"""
Add a transition to the stack of transitions.
Parameters
----------
state: torch.Tensor
action: torch.Tensor
reward: torch.Tensor
done: torch.Tensor
"""
self.states.append(state)
self.actions.append(action)
self.rewards.append(reward)
self.dones.append(done)
def to_torch(self) -> None:
"""
Convert the stack of transitions from a list of torch.Tensors to a
single instance of torch.Tensor.
"""
self.states = torch.stack(self.states)
self.actions = torch.stack(self.actions)
self.rewards = torch.tensor(self.rewards)
self.dones = torch.tensor(self.dones)
def to_gpu(self) -> None:
"""
Place the experience on the GPU.
"""
self.states = self.states.cuda()
self.actions = self.actions.cuda()
self.rewards = self.rewards.cuda()
self.dones = self.dones.cuda()
def get_episode_rewards(self) -> List[float]:
"""
Get the episode rewards.
Returns
-------
List[float]:
list of episode rewards
"""
cumulative_rewards = []
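        # Episode i spans rewards[start:end), where episode_ends holds the
        # indices one past each step whose done flag is True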
episode_ends = torch.where(self.dones)[0] + 1
for i in range(len(episode_ends)):
if i == 0:
start = 0
else:
start = episode_ends[i - 1]
end = episode_ends[i]
r = torch.sum(self.rewards[start:end]).item()
cumulative_rewards.append(r)
return cumulative_rewards
class Runner:
def __init__(self, env: VecEnv, agent: BaseAgent, use_gpu: bool) -> None:
"""
The runner manages the agent and environment interaction to collect
episodes and/or rollouts.
Parameters
----------
        env: VecEnv
            Multiprocessing-compatible gym environment
        agent: BaseAgent
            Agent to collect rollouts or episode experiences from
        use_gpu: bool
            whether or not to use the GPU; if False, use the CPU
"""
self.env = env
self.agent = agent
self.num_env = self.env.num_envs
self.use_gpu = use_gpu
if self.use_gpu:
self.agent.to_gpu()
# Initialize state
self.state = torch.tensor(self.env.reset())
if self.use_gpu:
self.state = self.state.cuda()
# Episode statistics
# Each list entry corresponds to a different parallel environment.
self.episode_stats = {
'reward': np.zeros(self.num_env),
'length': np.zeros(self.num_env),
'count': np.zeros(self.num_env),
}
def reset(self) -> None:
"""
Reset the state and episode stats within the Runner.
"""
self.state = torch.tensor(self.env.reset())
if self.use_gpu:
self.agent.to_gpu()
self.state = self.state.cuda()
self.episode_stats = {
'reward': np.zeros(self.num_env),
'length': np.zeros(self.num_env),
'count': np.zeros(self.num_env),
}
@torch.no_grad()
def generate_batch(self, rollout_steps: int) -> Dict[str, torch.Tensor]:
"""
Generate a batch of rollouts.
Will return a dictionary with keys: states, next_states, actions,
rewards, dones, and infos.
        - states and next_states have shape (rollout_steps, num_env, *state_shape).
        - actions have shape (rollout_steps, num_env, *act_shape) for
          continuous action spaces and (rollout_steps, num_env) for discrete ones.
        - rewards and dones have shape (rollout_steps, num_env).
        - infos contains episode metadata: a list of [episode_count,
          summary_dict] pairs recording the length and total reward of each
          completed episode.
Parameters
----------
rollout_steps: int
number of rollout steps to collect
Returns
-------
Dict[str, torch.Tensor]:
batch of rollouts with keys: states, next_states, actions, rewards,
dones, and infos
"""
# Initialize batch
batch_size = (
rollout_steps,
self.num_env,
)
obs_shape = batch_size + self.env.observation_shape
if self.env.type == 'discrete':
act_shape = batch_size
else:
act_shape = batch_size + self.env.action_shape
batch: Dict[str, Any] = {
'states': torch.empty(obs_shape),
'next_states': torch.empty(obs_shape),
'actions': torch.empty(act_shape),
'rewards': torch.empty(batch_size),
'dones': torch.empty(batch_size),
'infos': [],
}
for step in range(rollout_steps):
# Agent takes action
action = self.agent.step(self.state)
# Update environment
next_state, reward, done, info = self.env.step(action)
# Record transition to batch
batch['states'][step] = torch.as_tensor(self.state)
batch['next_states'][step] = torch.as_tensor(next_state)
            batch['actions'][step] = torch.as_tensor(action, dtype=torch.float)
batch['rewards'][step] = torch.as_tensor(reward)
batch['dones'][step] = torch.as_tensor(done)
# Update episode stats
self.episode_stats['reward'] += reward
self.episode_stats['length'] += np.ones(self.num_env)
self.episode_stats['count'] += done
# On episode end, update batch infos and reset
for i in range(self.num_env):
if done[i]:
update_dict = {
'reward/%i' % i: self.episode_stats['reward'][i],
'length/%i' % i: self.episode_stats['length'][i],
}
update = [self.episode_stats['count'][i], update_dict]
batch['infos'].append(update)
self.episode_stats['reward'][i] = 0
self.episode_stats['length'][i] = 0
# Update state
self.state = torch.tensor(next_state)
if self.use_gpu:
self.state = self.state.cuda()
# Batch to GPU
if self.use_gpu:
for (k, v) in batch.items():
if k != 'infos':
batch[k] = v.cuda()
return batch
@torch.no_grad()
def generate_episodes(self, num_episodes: int) -> Experiences:
"""
Generate episodes.
        Records states, actions, rewards, and dones.
        Returns a single Experiences object of stacked torch Tensors.
Parameters
----------
num_episodes: int
number of episodes to collectively acquire across all of the
environment threads
Returns
-------
        Experiences:
            an Experiences object with stacked states, actions, rewards,
            and dones tensors
"""
# Initialize batch
eps_by_env = [Experiences() for i in range(self.num_env)]
all_episodes = []
ep_count = 0
self.env.reset()
while ep_count < num_episodes:
# Agent takes action
action = self.agent.step(self.state)
# Update environment
next_state, reward, done, info = self.env.step(action)
# Record transition to batch
# On episode end, update batch infos and reset
for i in range(self.num_env):
# Record transition to buffer
eps_by_env[i].add(
torch.as_tensor(self.state[i]),
torch.as_tensor(action[i]),
torch.as_tensor(reward[i]),
torch.as_tensor(done[i]),
)
# On episode end, move from buffer to result_dict
if done[i]:
all_episodes.append(eps_by_env[i])
next_state[i] = self.env.envs[i].reset()
eps_by_env[i] = Experiences()
ep_count += 1
if ep_count >= num_episodes:
break
# Update state
self.state = torch.tensor(next_state)
if self.use_gpu:
self.state = self.state.cuda()
# Combine experiences across all environments
eps = Experiences()
for i in range(len(all_episodes)):
eps.states += all_episodes[i].states
eps.actions += all_episodes[i].actions
eps.rewards += all_episodes[i].rewards
eps.dones += all_episodes[i].dones
eps.to_torch()
if self.use_gpu:
eps.to_gpu()
return eps
@torch.no_grad()
def generate_test_episodes(
self, num_episodes: int, start_seed=24
) -> Experiences:
"""
Generate episodes using a single env with seeds for reproducibility.
        Records states, actions, rewards, and dones.
        Returns a single Experiences object of stacked torch Tensors.
Use for testing when you need to compare against other algorithms or runs.
Parameters
----------
        num_episodes: int
            number of episodes to collect from the single test environment
        start_seed: int, default=24
            base seed; episode k is seeded with start_seed * (k + 1)
Returns
-------
        Experiences:
            an Experiences object with stacked states, actions, rewards,
            and dones tensors
"""
# Initialize batch
eps = Experiences()
test_env = self.env.envs[0]
ep_count = 0
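        # Deterministic per-episode seeds: episode k is seeded with
        # start_seed * (k + 1)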
test_env.seed(start_seed * (ep_count + 1))
set_seed(start_seed * (ep_count + 1))
test_state = torch.tensor(
test_env.reset().copy(), dtype=torch.float
).unsqueeze(0)
if self.use_gpu:
test_state = test_state.cuda()
while ep_count < num_episodes:
# Agent takes action
action = self.agent.step(test_state)
# Update environment
next_state, reward, done, info = test_env.step(action[0])
# Record transition to batch
# On episode end, update batch infos and reset
eps.add(
torch.as_tensor(test_state.squeeze()),
torch.as_tensor(action.squeeze()),
torch.as_tensor(reward),
torch.as_tensor(done),
)
if done:
ep_count += 1
test_env.seed(start_seed * (ep_count + 1))
set_seed(start_seed * (ep_count + 1))
test_state = torch.tensor(
test_env.reset().copy(), dtype=torch.float
).unsqueeze(0)
else:
# Update state
test_state = torch.tensor(
next_state.copy(), dtype=torch.float
).unsqueeze(0)
if self.use_gpu:
test_state = test_state.cuda()
eps.to_torch()
if self.use_gpu:
eps.to_gpu()
return eps
Classes
class Experiences
Initialize experiences object, which stores a stack of agent-environment transitions.
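A minimal, self-contained example of the buffer (the transition values are
made up for illustration):

import torch
from ilpyt.runners.runner import Experiences

exp = Experiences()
# Two tiny episodes, of lengths 2 and 1, in a one-dimensional state space
for s, a, r, d in [
    (0.0, 0, 1.0, False),
    (0.1, 1, 1.0, True),   # end of episode 1
    (0.2, 0, 5.0, True),   # end of episode 2
]:
    exp.add(
        torch.tensor([s]),
        torch.tensor(a),
        torch.tensor(r),
        torch.tensor(d),
    )
exp.to_torch()
print(exp.states.shape)            # torch.Size([3, 1])
print(exp.get_episode_rewards())   # [2.0, 5.0]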
Methods
def add(self, state: torch.Tensor, action: torch.Tensor, reward: torch.Tensor, done: torch.Tensor) -> None

Add a transition to the stack of transitions.

Parameters

state : torch.Tensor
action : torch.Tensor
reward : torch.Tensor
done : torch.Tensor
def get_episode_rewards(self) -> List[float]

Get the episode rewards.

Returns

List[float] : list of episode rewards, one cumulative reward per completed episode
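The boundaries are derived from the stored done flags; for example:

import torch

dones = torch.tensor([False, False, True, False, True])
episode_ends = torch.where(dones)[0] + 1   # tensor([3, 5])
# episodes cover rewards[0:3] and rewards[3:5]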
def to_gpu(self) -> None

Place the experience on the GPU.
def to_torch(self) -> None

Convert the stack of transitions from a list of torch.Tensors to a single instance of torch.Tensor.
class Runner (env: VecEnv, agent: BaseAgent, use_gpu: bool)
The runner manages the agent and environment interaction to collect
episodes and/or rollouts.

Parameters

env : VecEnv
    Multiprocessing-compatible gym environment
agent : BaseAgent
    Agent to collect rollouts or episode experiences from
use_gpu : bool
    whether or not to use the GPU; if False, use the CPU
Methods
def generate_batch(self, rollout_steps: int) -> Dict[str, torch.Tensor]

Generate a batch of rollouts.

Returns a dictionary with keys: states, next_states, actions, rewards, dones, and infos.
- states and next_states have shape (rollout_steps, num_env, *state_shape).
- actions have shape (rollout_steps, num_env, *act_shape) for continuous action spaces and (rollout_steps, num_env) for discrete ones.
- rewards and dones have shape (rollout_steps, num_env).
- infos contains episode metadata: a list of [episode_count, summary_dict] pairs recording the length and total reward of each completed episode.

Parameters

rollout_steps : int
    number of rollout steps to collect

Returns

Dict[str, torch.Tensor] : batch of rollouts with keys: states, next_states, actions, rewards, dones, and infos
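A sketch of consuming the batch, continuing the hypothetical runner from the
module example above:

batch = runner.generate_batch(rollout_steps=64)

states = batch['states']     # shape (64, num_env, *obs_shape)
rewards = batch['rewards']   # shape (64, num_env)

# Each infos entry is an [episode_count, summary_dict] pair for one
# completed episode
for count, summary in batch['infos']:
    print(count, summary)    # e.g. 3.0 {'reward/0': 21.0, 'length/0': 21.0}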
def generate_episodes(self, num_episodes: int) -> Experiences

Generate episodes. Records states, actions, rewards, and dones.

Returns a single Experiences object of stacked torch Tensors.

Parameters

num_episodes : int
    number of episodes to collectively acquire across all of the environment threads

Returns

Experiences : object with stacked states, actions, rewards, and dones tensors
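For example, collecting trajectories and reading back per-episode returns
(again using the hypothetical runner from the module example):

eps = runner.generate_episodes(num_episodes=20)

print(eps.states.shape[0])         # total transitions across all 20 episodes
print(eps.get_episode_rewards())   # one cumulative reward per episode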
def generate_test_episodes(self, num_episodes: int, start_seed=24) -> Experiences

Generate episodes using a single env, seeded for reproducibility. Records states, actions, rewards, and dones.

Returns a single Experiences object of stacked torch Tensors. Use for testing when you need to compare against other algorithms or runs.

Parameters

num_episodes : int
    number of episodes to collect from the single test environment
start_seed : int, default=24
    base seed; episode k is seeded with start_seed * (k + 1)

Returns

Experiences : object with stacked states, actions, rewards, and dones tensors
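Because each episode is re-seeded deterministically from start_seed, two
calls with the same arguments should reproduce the same trajectories
(assuming the environment and any agent sampling respect the seeds):

eps_a = runner.generate_test_episodes(num_episodes=5, start_seed=24)
eps_b = runner.generate_test_episodes(num_episodes=5, start_seed=24)
assert eps_a.get_episode_rewards() == eps_b.get_episode_rewards()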
def reset(self) -> None

Reset the state and episode stats within the Runner.