diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..fc86bf81 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,38 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: [ master, develop ] + pull_request: + branches: [ master, develop ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + sudo apt-get install swig + sudo apt-get install unrar + pip install torch==1.8.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + make install + AutoROM -v + - name: Lint code + run: | + make lint + - name: Run tests + run: | + make test diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..1a03a7b6 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,31 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* diff --git a/.gitignore b/.gitignore index 3d655dcf..686cc724 100644 --- a/.gitignore +++ b/.gitignore @@ -10,9 +10,14 @@ autonomous_learning_library.egg-info # editor .vscode .idea +*.code-workspace # non-committed code local legacy /runs /out + +# notebooks +*.ipynb +*.ipynb_checkpoints diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 29b7982e..00000000 --- a/.travis.yml +++ /dev/null @@ -1,17 +0,0 @@ -language: python -python: - - "3.6" - - "3.7" - - "3.8" -branches: - only: - - master - - develop -before_install: - - sudo apt-get install swig -install: - - pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html - - pip install -q -e .["dev"] -script: - - make lint - - make test diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ca7d0d7..2f7668d8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,6 +26,12 @@ The unit tests may be run using: make test ``` +You can automatically format your code to match our code style using: + +``` +make format +``` + Finally, you rebuild the documentation using: ``` diff --git a/Makefile b/Makefile index 5413d36f..094d405f 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,20 @@ install: pip install -e .[dev] -lint: - pylint all --rcfile=.pylintrc +test: unit-test integration-test -test: +unit-test: python -m unittest discover -s all -p "*test.py" +integration-test: + python -m 
unittest discover -s integration -p "*test.py" + +lint: + flake8 --ignore "E501,E731,E74,E402,F401,W503,E128" all + +format: + autopep8 --in-place --aggressive --aggressive --ignore "E501,E731,E74,E402,F401,W503,E128" -r all + tensorboard: tensorboard --logdir runs diff --git a/all/.DS_Store b/all/.DS_Store new file mode 100644 index 00000000..85d71962 Binary files /dev/null and b/all/.DS_Store differ diff --git a/all/__init__.py b/all/__init__.py index 77ba8012..3bfd56fc 100644 --- a/all/__init__.py +++ b/all/__init__.py @@ -1,4 +1,26 @@ +import all.agents +import all.approximation +import all.core +import all.environments +import all.logging +import all.memory import all.nn +import all.optim +import all.policies +import all.presets from all.core import State, StateArray -__all__ = ['nn', 'State', 'StateArray'] +__all__ = [ + 'agents', + 'approximation', + 'core', + 'environments', + 'logging', + 'memory', + 'nn', + 'optim', + 'policies', + 'presets', + 'State', + 'StateArray' +] diff --git a/all/agents/__init__.py b/all/agents/__init__.py index c9ae31f4..2811a056 100644 --- a/all/agents/__init__.py +++ b/all/agents/__init__.py @@ -1,29 +1,50 @@ from ._agent import Agent -from .a2c import A2C -from .c51 import C51 -from .ddpg import DDPG -from .ddqn import DDQN -from .dqn import DQN -from .ppo import PPO -from .rainbow import Rainbow -from .sac import SAC -from .vac import VAC -from .vpg import VPG -from .vqn import VQN -from .vsarsa import VSarsa +from ._multiagent import Multiagent +from ._parallel_agent import ParallelAgent +from .a2c import A2C, A2CTestAgent +from .c51 import C51, C51TestAgent +from .ddpg import DDPG, DDPGTestAgent +from .ddqn import DDQN, DDQNTestAgent +from .dqn import DQN, DQNTestAgent +from .independent import IndependentMultiagent +from .ppo import PPO, PPOTestAgent +from .rainbow import Rainbow, RainbowTestAgent +from .sac import SAC, SACTestAgent +from .vac import VAC, VACTestAgent +from .vpg import VPG, VPGTestAgent +from .vqn import VQN, VQNTestAgent +from .vsarsa import VSarsa, VSarsaTestAgent + __all__ = [ + # Agent interfaces "Agent", + "Multiagent", + "ParallelAgent", + # Agent implementations "A2C", + "A2CTestAgent", "C51", + "C51TestAgent", "DDPG", + "DDPGTestAgent", "DDQN", + "DDQNTestAgent", "DQN", + "DQNTestAgent", "PPO", + "PPOTestAgent", "Rainbow", + "RainbowTestAgent", "SAC", + "SACTestAgent", "VAC", + "VACTestAgent", "VPG", + "VPGTestAgent", "VQN", + "VQNTestAgent", "VSarsa", + "VSarsaTestAgent", + "IndependentMultiagent", ] diff --git a/all/agents/_agent.py b/all/agents/_agent.py index 91c8f69d..45038f14 100644 --- a/all/agents/_agent.py +++ b/all/agents/_agent.py @@ -1,15 +1,16 @@ from abc import ABC, abstractmethod from all.optim import Schedulable + class Agent(ABC, Schedulable): """ A reinforcement learning agent. In reinforcement learning, an Agent learns by interacting with an Environment. - Usually, an agent tries to maximize a reward signal. + Usually, an Agent tries to maximize a reward signal. It does this by observing environment "states", taking "actions", receiving "rewards", - and in doing so, learning which state-action pairs correlate with high rewards. - An Agent implementation should encapsulate some particular reinforcement learning algorihthm. + and learning which state-action pairs correlate with high rewards. + An Agent implementation should encapsulate some particular reinforcement learning algorithm. 
""" @abstractmethod @@ -31,20 +32,3 @@ def act(self, state): Returns: torch.Tensor: The action to take at the current timestep. """ - - @abstractmethod - def eval(self, state): - """ - Select an action for the current timestep in evaluation mode. - - Unlike act, this method should NOT update the internal parameters of the agent. - Most of the time, this method should return the greedy action according to the current policy. - This method is useful when using evaluation methodologies that distinguish between the performance - of the agent during training and the performance of the resulting policy. - - Args: - state (all.environment.State): The environment state at the current timestep. - - Returns: - torch.Tensor: The action to take at the current timestep. - """ diff --git a/all/agents/_multiagent.py b/all/agents/_multiagent.py new file mode 100644 index 00000000..11f4a18d --- /dev/null +++ b/all/agents/_multiagent.py @@ -0,0 +1,34 @@ +from abc import ABC, abstractmethod +from all.optim import Schedulable + + +class Multiagent(ABC, Schedulable): + """ + A multiagent RL agent. Differs from standard agents in that it accepts a multiagent state. + + In reinforcement learning, an Agent learns by interacting with an Environment. + Usually, an agent tries to maximize a reward signal. + It does this by observing environment "states", taking "actions", receiving "rewards", + and learning which state-action pairs correlate with high rewards. + An Agent implementation should encapsulate some particular reinforcement learning algorithm. + """ + + @abstractmethod + def act(self, multiagent_state): + """ + Select an action for the current timestep and update internal parameters. + + In general, a reinforcement learning agent does several things during a timestep: + 1. Choose an action, + 2. Compute the TD error from the previous time step + 3. Update the value function and/or policy + The order of these steps differs depending on the agent. + This method allows the agent to do whatever is necessary for itself on a given timestep. + However, the agent must ultimately return an action. + + Args: + multiagent_state (all.core.MultiagentState): The environment state at the current timestep. + + Returns: + torch.Tensor: The action for the current agent to take at the current timestep. + """ diff --git a/all/agents/_parallel_agent.py b/all/agents/_parallel_agent.py new file mode 100644 index 00000000..b98ae618 --- /dev/null +++ b/all/agents/_parallel_agent.py @@ -0,0 +1,36 @@ +from abc import ABC, abstractmethod +from all.optim import Schedulable + + +class ParallelAgent(ABC, Schedulable): + """ + A reinforcement learning agent that chooses actions for multiple states simultaneously. + Differs from SingleAgent in that it accepts a StateArray instead of a State to process + input from multiple environments in parallel. + + In reinforcement learning, an Agent learns by interacting with an Environment. + Usually, an Agent tries to maximize a reward signal. + It does this by observing environment "states", taking "actions", receiving "rewards", + and learning which state-action pairs correlate with high rewards. + An Agent implementation should encapsulate some particular reinforcement learning algorithm. + """ + + @abstractmethod + def act(self, state_array): + """ + Select an action for the current timestep and update internal parameters. + + In general, a reinforcement learning agent does several things during a timestep: + 1. Choose an action, + 2. Compute the TD error from the previous time step + 3. 
Update the value function and/or policy + The order of these steps differs depending on the agent. + This method allows the agent to do whatever is necessary for itself on a given timestep. + However, the agent must ultimately return an action. + + Args: + state_array (all.environment.StateArray): An array of states for each parallel environment. + + Returns: + torch.Tensor: The actions to take for each parallel environment. + """ diff --git a/all/agents/a2c.py b/all/agents/a2c.py index ad1494c9..f27a0cf8 100644 --- a/all/agents/a2c.py +++ b/all/agents/a2c.py @@ -1,10 +1,12 @@ +import torch from torch.nn.functional import mse_loss from all.logging import DummyWriter from all.memory import NStepAdvantageBuffer from ._agent import Agent +from ._parallel_agent import ParallelAgent -class A2C(Agent): +class A2C(ParallelAgent): """ Advantage Actor-Critic (A2C). A2C is policy gradient method in the actor-critic family. @@ -24,6 +26,7 @@ class A2C(Agent): n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout. writer (Writer): Used for logging. """ + def __init__( self, features, @@ -60,9 +63,6 @@ def act(self, states): self._actions = self.policy.no_grad(self.features.no_grad(states)).sample() return self._actions - def eval(self, states): - return self.policy.eval(self.features.eval(states)) - def _train(self, next_states): if len(self._buffer) >= self._batch_size: # load trajectories from buffer @@ -99,4 +99,12 @@ def _make_buffer(self): self.n_envs, discount_factor=self.discount_factor ) - \ No newline at end of file + + +class A2CTestAgent(Agent): + def __init__(self, features, policy): + self.features = features + self.policy = policy + + def act(self, state): + return self.policy.eval(self.features.eval(state)).sample() diff --git a/all/agents/c51.py b/all/agents/c51.py index b68daf60..972f2b46 100644 --- a/all/agents/c51.py +++ b/all/agents/c51.py @@ -114,3 +114,16 @@ def _kl(self, dist, target_dist): log_dist = torch.log(torch.clamp(dist, min=self.eps)) log_target_dist = torch.log(torch.clamp(target_dist, min=self.eps)) return (target_dist * (log_target_dist - log_dist)).sum(dim=-1) + + +class C51TestAgent(Agent): + def __init__(self, q_dist, n_actions, exploration=0.): + self.q_dist = q_dist + self.n_actions = n_actions + self.exploration = exploration + + def act(self, state): + if np.random.rand() < self.exploration: + return np.random.randint(0, self.n_actions) + q_values = (self.q_dist(state) * self.q_dist.atoms).sum(dim=-1) + return torch.argmax(q_values, dim=-1) diff --git a/all/agents/ddpg.py b/all/agents/ddpg.py index 3da06942..3cbec351 100644 --- a/all/agents/ddpg.py +++ b/all/agents/ddpg.py @@ -3,6 +3,7 @@ from torch.nn.functional import mse_loss from ._agent import Agent + class DDPG(Agent): """ Deep Deterministic Policy Gradient (DDPG). @@ -26,6 +27,7 @@ class DDPG(Agent): replay_start_size (int): Number of experiences in replay buffer when training begins. update_frequency (int): Number of timesteps per training update.
""" + def __init__(self, q, policy, @@ -91,3 +93,11 @@ def _train(self): def _should_train(self): self._frames_seen += 1 return self._frames_seen > self.replay_start_size and self._frames_seen % self.update_frequency == 0 + + +class DDPGTestAgent(Agent): + def __init__(self, policy): + self.policy = policy + + def act(self, state): + return self.policy.eval(state) diff --git a/all/agents/ddqn.py b/all/agents/ddqn.py index 0894f1d0..0f9a1acc 100644 --- a/all/agents/ddqn.py +++ b/all/agents/ddqn.py @@ -1,6 +1,7 @@ import torch from all.nn import weighted_mse_loss from ._agent import Agent +from .dqn import DQNTestAgent class DDQN(Agent): @@ -24,6 +25,7 @@ class DDQN(Agent): replay_start_size (int): Number of experiences in replay buffer when training begins. update_frequency (int): Number of timesteps per training update. ''' + def __init__(self, q, policy, @@ -79,3 +81,6 @@ def _train(self): def _should_train(self): self._frames_seen += 1 return self._frames_seen > self.replay_start_size and self._frames_seen % self.update_frequency == 0 + + +DDQNTestAgent = DQNTestAgent diff --git a/all/agents/dqn.py b/all/agents/dqn.py index 6930893b..f7cb1ef7 100644 --- a/all/agents/dqn.py +++ b/all/agents/dqn.py @@ -1,3 +1,4 @@ +import numpy as np import torch from torch.nn.functional import mse_loss from ._agent import Agent @@ -25,6 +26,7 @@ class DQN(Agent): replay_start_size (int): Number of experiences in replay buffer when training begins. update_frequency (int): Number of timesteps per training update. ''' + def __init__(self, q, policy, @@ -75,5 +77,16 @@ def _train(self): def _should_train(self): self._frames_seen += 1 - return (self._frames_seen > self.replay_start_size and - self._frames_seen % self.update_frequency == 0) + return (self._frames_seen > self.replay_start_size and self._frames_seen % self.update_frequency == 0) + + +class DQNTestAgent(Agent): + def __init__(self, q, n_actions, exploration=0.): + self.q = q + self.n_actions = n_actions + self.exploration = 0.001 + + def act(self, state): + if np.random.rand() < self.exploration: + return np.random.randint(0, self.n_actions) + return torch.argmax(self.q.eval(state)).item() diff --git a/all/agents/independent.py b/all/agents/independent.py new file mode 100644 index 00000000..32c3275f --- /dev/null +++ b/all/agents/independent.py @@ -0,0 +1,9 @@ +from ._multiagent import Multiagent + + +class IndependentMultiagent(Multiagent): + def __init__(self, agents): + self.agents = agents + + def act(self, state): + return self.agents[state['agent']].act(state) diff --git a/all/agents/ppo.py b/all/agents/ppo.py index 589385ca..5499ebac 100644 --- a/all/agents/ppo.py +++ b/all/agents/ppo.py @@ -3,9 +3,11 @@ from all.logging import DummyWriter from all.memory import GeneralizedAdvantageBuffer from ._agent import Agent +from ._parallel_agent import ParallelAgent +from .a2c import A2CTestAgent -class PPO(Agent): +class PPO(ParallelAgent): """ Proximal Policy Optimization (PPO). PPO is an actor-critic style policy gradient algorithm that allows for the reuse of samples @@ -26,6 +28,7 @@ class PPO(Agent): n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout. writer (Writer): Used for logging. 
""" + def __init__( self, features, @@ -138,4 +141,6 @@ def _make_buffer(self): discount_factor=self.discount_factor, lam=self.lam ) - \ No newline at end of file + + +PPOTestAgent = A2CTestAgent diff --git a/all/agents/rainbow.py b/all/agents/rainbow.py index 413a0f5f..665603f1 100644 --- a/all/agents/rainbow.py +++ b/all/agents/rainbow.py @@ -1,4 +1,5 @@ -from .c51 import C51 +from .c51 import C51, C51TestAgent + class Rainbow(C51): """ @@ -28,3 +29,6 @@ class Rainbow(C51): when training begins. update_frequency (int): Number of timesteps per training update. """ + + +RainbowTestAgent = C51TestAgent diff --git a/all/agents/sac.py b/all/agents/sac.py index 4c3e22f1..562b88a2 100644 --- a/all/agents/sac.py +++ b/all/agents/sac.py @@ -3,6 +3,7 @@ from all.logging import DummyWriter from ._agent import Agent + class SAC(Agent): """ Soft Actor-Critic (SAC). @@ -28,6 +29,7 @@ class SAC(Agent): temperature_initial (float): The initial temperature used in the maximum entropy objective. update_frequency (int): Number of timesteps per training update. """ + def __init__(self, policy, q_1, @@ -70,9 +72,6 @@ def act(self, state): self._action = self.policy.no_grad(state)[0] return self._action - def eval(self, state): - return self.policy.eval(state) - def _train(self): if self._should_train(): # sample from replay buffer @@ -111,3 +110,11 @@ def _train(self): def _should_train(self): self._frames_seen += 1 return self._frames_seen > self.replay_start_size and self._frames_seen % self.update_frequency == 0 + + +class SACTestAgent(Agent): + def __init__(self, policy): + self.policy = policy + + def act(self, state): + return self.policy.eval(state) diff --git a/all/agents/vac.py b/all/agents/vac.py index dea134fc..4c5bd48a 100644 --- a/all/agents/vac.py +++ b/all/agents/vac.py @@ -1,8 +1,10 @@ from torch.nn.functional import mse_loss from ._agent import Agent +from ._parallel_agent import ParallelAgent +from .a2c import A2CTestAgent -class VAC(Agent): +class VAC(ParallelAgent): ''' Vanilla Actor-Critic (VAC). VAC is an implementation of the actor-critic alogorithm found in the Sutton and Barto (2018) textbook. @@ -19,6 +21,7 @@ class VAC(Agent): n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout. writer (Writer): Used for logging. ''' + def __init__(self, features, v, policy, discount_factor=1): self.features = features self.v = v @@ -55,3 +58,6 @@ def _train(self, state, reward): self.v.reinforce(value_loss) self.policy.reinforce(policy_loss) self.features.reinforce() + + +VACTestAgent = A2CTestAgent diff --git a/all/agents/vpg.py b/all/agents/vpg.py index 1db69df9..f8052bda 100644 --- a/all/agents/vpg.py +++ b/all/agents/vpg.py @@ -2,6 +2,8 @@ from torch.nn.functional import mse_loss from all.core import State from ._agent import Agent +from .a2c import A2CTestAgent + class VPG(Agent): ''' @@ -24,6 +26,7 @@ class VPG(Agent): this many state-action pairs are seen. Set this to a large value in order to train on multiple episodes at once. 
''' + def __init__( self, features, @@ -128,3 +131,6 @@ def _compute_discounted_returns(self, rewards): returns[t] = discounted_return t -= 1 return returns + + +VPGTestAgent = A2CTestAgent diff --git a/all/agents/vqn.py b/all/agents/vqn.py index ff2f9e67..5193081c 100644 --- a/all/agents/vqn.py +++ b/all/agents/vqn.py @@ -1,9 +1,11 @@ import torch from torch.nn.functional import mse_loss from ._agent import Agent +from ._parallel_agent import ParallelAgent +from .dqn import DQNTestAgent -class VQN(Agent): +class VQN(ParallelAgent): ''' Vanilla Q-Network (VQN). VQN is an implementation of the Q-learning algorithm found in the Sutton and Barto (2018) textbook. @@ -18,6 +20,7 @@ class VQN(Agent): policy (GreedyPolicy): A policy derived from the Q-function. discount_factor (float): Discount factor for future rewards. ''' + def __init__(self, q, policy, discount_factor=0.99): self.q = q self.policy = policy @@ -45,3 +48,6 @@ def _train(self, reward, next_state): loss = mse_loss(value, target) # backward pass self.q.reinforce(loss) + + +VQNTestAgent = DQNTestAgent diff --git a/all/agents/vsarsa.py b/all/agents/vsarsa.py index 28443a00..df0b35b3 100644 --- a/all/agents/vsarsa.py +++ b/all/agents/vsarsa.py @@ -1,8 +1,10 @@ from torch.nn.functional import mse_loss from ._agent import Agent +from ._parallel_agent import ParallelAgent +from .dqn import DQNTestAgent -class VSarsa(Agent): +class VSarsa(ParallelAgent): ''' Vanilla SARSA (VSarsa). SARSA (State-Action-Reward-State-Action) is an on-policy alternative to Q-learning. Unlike Q-learning, @@ -15,6 +17,7 @@ class VSarsa(Agent): policy (GreedyPolicy): A policy derived from the Q-function. discount_factor (float): Discount factor for future rewards. ''' + def __init__(self, q, policy, discount_factor=0.99): self.q = q self.policy = policy @@ -42,3 +45,6 @@ def _train(self, reward, next_state, next_action): loss = mse_loss(value, target) # backward pass self.q.reinforce(loss) + + +VSarsaTestAgent = DQNTestAgent diff --git a/all/approximation/.DS_Store b/all/approximation/.DS_Store new file mode 100644 index 00000000..88d51bb2 Binary files /dev/null and b/all/approximation/.DS_Store differ diff --git a/all/approximation/__init__.py b/all/approximation/__init__.py index 24889649..2890f66c 100644 --- a/all/approximation/__init__.py +++ b/all/approximation/__init__.py @@ -4,6 +4,7 @@ from .q_network import QNetwork from .v_network import VNetwork from .feature_network import FeatureNetwork +from .identity import Identity from .target import TargetNetwork, FixedTarget, PolyakTarget, TrivialTarget from .checkpointer import Checkpointer, DummyCheckpointer, PeriodicCheckpointer @@ -16,10 +17,11 @@ "VNetwork", "FeatureNetwork", "TargetNetwork", + "Identity", "FixedTarget", "PolyakTarget", "TrivialTarget", "Checkpointer", "DummyCheckpointer", - "PeriodicCheckpointer" + "PeriodicCheckpointer", ] diff --git a/all/approximation/approximation.py b/all/approximation/approximation.py index 7080dda5..4b45f7ee 100644 --- a/all/approximation/approximation.py +++ b/all/approximation/approximation.py @@ -3,10 +3,11 @@ from torch.nn import utils from all.logging import DummyWriter from .target import TrivialTarget -from .checkpointer import PeriodicCheckpointer +from .checkpointer import DummyCheckpointer DEFAULT_CHECKPOINT_FREQUENCY = 200 + class Approximation(): ''' Base function approximation object. @@ -31,6 +32,8 @@ class Approximation(): gradient to this value in order prevent large updates and improve stability. See torch.nn.utils.clip_grad. 
+ device (string, optional): The device that the model is on. If none is passed, + the device will be automatically determined based on model.parameters() loss_scaling (float, optional): Multiplies the loss by this value before performing a backwards pass. Useful when used with multi-headed networks with shared feature layers. @@ -46,12 +49,14 @@ class Approximation(): The standard object logs to tensorboard, however, other types of Writer objects may be implemented by the user. ''' + def __init__( self, model, - optimizer, + optimizer=None, checkpointer=None, clip_grad=0, + device=None, loss_scaling=1, name='approximation', scheduler=None, @@ -59,7 +64,7 @@ def __init__( writer=DummyWriter(), ): self.model = model - self.device = next(model.parameters()).device + self.device = device if device else next(model.parameters()).device self._target = target or TrivialTarget() self._scheduler = scheduler self._target.init(model) @@ -72,7 +77,7 @@ def __init__( self._name = name if checkpointer is None: - checkpointer = PeriodicCheckpointer(DEFAULT_CHECKPOINT_FREQUENCY) + checkpointer = DummyCheckpointer() self._checkpointer = checkpointer self._checkpointer.init( self.model, diff --git a/all/approximation/feature_network.py b/all/approximation/feature_network.py index 43edf5f6..ca9eea89 100644 --- a/all/approximation/feature_network.py +++ b/all/approximation/feature_network.py @@ -12,6 +12,7 @@ class FeatureNetwork(Approximation): The reinforce() function will then backpropagate the accumulated gradients on the output through the original computation graph. ''' + def __init__(self, model, optimizer=None, name='feature', **kwargs): model = FeatureModule(model) super().__init__(model, optimizer, name=name, **kwargs) @@ -26,11 +27,10 @@ def __call__(self, states): state (all.environment.State): An environment State Returns: - all.environment.State: An enviornment State with the computed features + all.environment.State: An environment State with the computed features ''' features = self.model(states) graphs = features.observation - # pylint: disable=protected-access observation = graphs.detach() observation.requires_grad = True features['observation'] = observation @@ -42,8 +42,9 @@ def reinforce(self): Backward pass of the model. 
''' graphs, grads = self._dequeue() - graphs.backward(grads) - self.step() + if graphs.requires_grad: + graphs.backward(grads) + self.step() def _enqueue(self, features, out): self._cache.append(features) @@ -60,6 +61,7 @@ def _dequeue(self): self._out = [] return torch.cat(graphs), torch.cat(grads) + class FeatureModule(torch.nn.Module): def __init__(self, model): super().__init__() diff --git a/all/approximation/feature_network_test.py b/all/approximation/feature_network_test.py index 6540af36..e3f89979 100644 --- a/all/approximation/feature_network_test.py +++ b/all/approximation/feature_network_test.py @@ -60,6 +60,20 @@ def assert_state_equal(self, actual, expected): tt.assert_almost_equal(actual.observation, expected.observation, decimal=2) tt.assert_equal(actual.mask, expected.mask) + def test_identity_features(self): + model = nn.Sequential(nn.Identity()) + features = FeatureNetwork(model, None, device='cpu') + + # forward pass + x = State({'observation': torch.tensor([1., 2., 3.])}) + y = features(x) + tt.assert_equal(y.observation, x.observation) + + # backward pass shouldn't raise exception + loss = y.observation.sum() + loss.backward() + features.reinforce() + if __name__ == "__main__": unittest.main() diff --git a/all/approximation/identity.py b/all/approximation/identity.py new file mode 100644 index 00000000..77353b9b --- /dev/null +++ b/all/approximation/identity.py @@ -0,0 +1,20 @@ +import torch +from torch import nn +from .approximation import Approximation + + +class Identity(Approximation): + ''' + An Approximation that represents the identity function. + + Because the model has no parameters, reinforce and step do nothing. + ''' + + def __init__(self, device, name='identity', **kwargs): + super().__init__(nn.Identity(), None, device=device, name=name, **kwargs) + + def reinforce(self): + return self + + def step(self): + return self diff --git a/all/approximation/identity_test.py b/all/approximation/identity_test.py new file mode 100644 index 00000000..7af58afd --- /dev/null +++ b/all/approximation/identity_test.py @@ -0,0 +1,42 @@ +import unittest +import torch +import torch_testing as tt +from all.core import State +from all.approximation import Identity, FixedTarget + + +class TestIdentityNetwork(unittest.TestCase): + def setUp(self): + self.model = Identity('cpu', target=FixedTarget(10)) + + def test_forward_tensor(self): + inputs = torch.tensor([1, 2, 3]) + outputs = self.model(inputs) + tt.assert_equal(inputs, outputs) + + def test_forward_state(self): + inputs = State({ + 'observation': torch.tensor([1, 2, 3]) + }) + outputs = self.model(inputs) + self.assertEqual(inputs, outputs) + + def test_eval(self): + inputs = torch.tensor([1, 2, 3]) + outputs = self.model.target(inputs) + tt.assert_equal(inputs, outputs) + + def test_target(self): + inputs = torch.tensor([1, 2, 3]) + outputs = self.model.target(inputs) + tt.assert_equal(inputs, outputs) + + def test_reinforce(self): + self.model.reinforce() + + def test_step(self): + self.model.step() + + +if __name__ == "__main__": + unittest.main() diff --git a/all/approximation/q_continuous.py b/all/approximation/q_continuous.py index 889f504f..29dc754d 100644 --- a/all/approximation/q_continuous.py +++ b/all/approximation/q_continuous.py @@ -2,6 +2,7 @@ from all.nn import RLNetwork from .approximation import Approximation + class QContinuous(Approximation): def __init__( self, @@ -18,6 +19,7 @@ def __init__( **kwargs ) + class QContinuousModule(RLNetwork): def forward(self, states, actions): x = 
torch.cat((states.observation.float(), actions), dim=1) diff --git a/all/approximation/q_dist.py b/all/approximation/q_dist.py index d3afe768..eb195169 100644 --- a/all/approximation/q_dist.py +++ b/all/approximation/q_dist.py @@ -23,7 +23,6 @@ def __init__( super().__init__(model, optimizer, name=name, **kwargs) def project(self, dist, support): - # pylint: disable=invalid-name target_dist = dist * 0 atoms = self.atoms v_min = atoms[0] diff --git a/all/approximation/q_dist_test.py b/all/approximation/q_dist_test.py index b8027e97..778db4ee 100644 --- a/all/approximation/q_dist_test.py +++ b/all/approximation/q_dist_test.py @@ -159,24 +159,24 @@ def test_project_dist(self): [-9.7030, -9.3149, -8.9268, -8.5386, -8.1505, -7.7624, -7.3743, -6.9862, -6.5980, -6.2099, -5.8218, -5.4337, -5.0456, -4.6574, -4.2693, -3.8812, -3.4931, -3.1050, -2.7168, -2.3287, -1.9406, -1.5525, -1.1644, -0.7762, - -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, - 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, - 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, - 8.9268, 9.3149, 9.7030], + -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, + 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, + 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, + 8.9268, 9.3149, 9.7030], [-9.7030, -9.3149, -8.9268, -8.5386, -8.1505, -7.7624, -7.3743, -6.9862, -6.5980, -6.2099, -5.8218, -5.4337, -5.0456, -4.6574, -4.2693, -3.8812, -3.4931, -3.1050, -2.7168, -2.3287, -1.9406, -1.5525, -1.1644, -0.7762, - -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, - 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, - 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, - 8.9268, 9.3149, 9.7030], + -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, + 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, + 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, + 8.9268, 9.3149, 9.7030], [-9.7030, -9.3149, -8.9268, -8.5386, -8.1505, -7.7624, -7.3743, -6.9862, -6.5980, -6.2099, -5.8218, -5.4337, -5.0456, -4.6574, -4.2693, -3.8812, -3.4931, -3.1050, -2.7168, -2.3287, -1.9406, -1.5525, -1.1644, -0.7762, - -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, - 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, - 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, - 8.9268, 9.3149, 9.7030] + -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, + 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, + 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, + 8.9268, 9.3149, 9.7030] ]) expected = torch.tensor([ [0.0049, 0.0198, 0.0204, 0.0202, 0.0198, 0.0202, 0.0202, 0.0199, 0.0202, @@ -229,24 +229,24 @@ def test_project_dist_cuda(self): [-9.7030, -9.3149, -8.9268, -8.5386, -8.1505, -7.7624, -7.3743, -6.9862, -6.5980, -6.2099, -5.8218, -5.4337, -5.0456, -4.6574, -4.2693, -3.8812, -3.4931, -3.1050, -2.7168, -2.3287, -1.9406, -1.5525, -1.1644, -0.7762, - -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, - 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, - 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, - 8.9268, 9.3149, 9.7030], + -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, + 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, + 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, + 8.9268, 9.3149, 9.7030], [-9.7030, -9.3149, -8.9268, -8.5386, 
-8.1505, -7.7624, -7.3743, -6.9862, -6.5980, -6.2099, -5.8218, -5.4337, -5.0456, -4.6574, -4.2693, -3.8812, -3.4931, -3.1050, -2.7168, -2.3287, -1.9406, -1.5525, -1.1644, -0.7762, - -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, - 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, - 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, - 8.9268, 9.3149, 9.7030], + -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, + 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, + 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, + 8.9268, 9.3149, 9.7030], [-9.7030, -9.3149, -8.9268, -8.5386, -8.1505, -7.7624, -7.3743, -6.9862, -6.5980, -6.2099, -5.8218, -5.4337, -5.0456, -4.6574, -4.2693, -3.8812, -3.4931, -3.1050, -2.7168, -2.3287, -1.9406, -1.5525, -1.1644, -0.7762, - -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, - 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, - 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, - 8.9268, 9.3149, 9.7030] + -0.3881, 0.0000, 0.3881, 0.7762, 1.1644, 1.5525, 1.9406, 2.3287, + 2.7168, 3.1050, 3.4931, 3.8812, 4.2693, 4.6574, 5.0456, 5.4337, + 5.8218, 6.2099, 6.5980, 6.9862, 7.3743, 7.7624, 8.1505, 8.5386, + 8.9268, 9.3149, 9.7030] ]).cuda() expected = torch.tensor([ [0.0049, 0.0198, 0.0204, 0.0202, 0.0198, 0.0202, 0.0202, 0.0199, 0.0202, @@ -270,5 +270,6 @@ def test_project_dist_cuda(self): ]) tt.assert_almost_equal(q.project(dist, support).cpu(), expected.cpu(), decimal=3) + if __name__ == "__main__": unittest.main() diff --git a/all/approximation/q_network.py b/all/approximation/q_network.py index 435a69d4..6cbe999c 100644 --- a/all/approximation/q_network.py +++ b/all/approximation/q_network.py @@ -2,11 +2,12 @@ from all.nn import RLNetwork from .approximation import Approximation + class QNetwork(Approximation): def __init__( self, model, - optimizer, + optimizer=None, name='q', **kwargs ): @@ -18,6 +19,7 @@ def __init__( **kwargs ) + class QModule(RLNetwork): def forward(self, states, actions=None): values = super().forward(states) diff --git a/all/approximation/q_network_test.py b/all/approximation/q_network_test.py index df11a8c8..dee5a061 100644 --- a/all/approximation/q_network_test.py +++ b/all/approximation/q_network_test.py @@ -10,12 +10,14 @@ STATE_DIM = 2 ACTIONS = 3 + class TestQNetwork(unittest.TestCase): def setUp(self): torch.manual_seed(2) self.model = nn.Sequential( nn.Linear(STATE_DIM, ACTIONS) ) + def optimizer(params): return torch.optim.SGD(params, lr=0.1) self.q = QNetwork(self.model, optimizer) @@ -46,7 +48,6 @@ def test_eval_actions(self): self.assertEqual(result.shape, torch.Size([3])) tt.assert_almost_equal(result, torch.tensor([-0.7262873, 0.3484948, -0.0296164])) - def test_target_net(self): torch.manual_seed(2) model = nn.Sequential( @@ -93,5 +94,6 @@ def loss(policy_value): np.testing.assert_equal(policy_value.item(), -0.8085841536521912) np.testing.assert_equal(target_value, -0.6085841655731201) + if __name__ == '__main__': unittest.main() diff --git a/all/approximation/target/abstract.py b/all/approximation/target/abstract.py index f57650fe..31453816 100644 --- a/all/approximation/target/abstract.py +++ b/all/approximation/target/abstract.py @@ -1,6 +1,6 @@ from abc import abstractmethod, ABC -# pylint: disable=arguments-differ + class TargetNetwork(ABC): @abstractmethod def __call__(self, *inputs): diff --git a/all/approximation/target/fixed.py b/all/approximation/target/fixed.py index 236f7e73..7f6c576d 
100644 --- a/all/approximation/target/fixed.py +++ b/all/approximation/target/fixed.py @@ -2,6 +2,7 @@ import torch from .abstract import TargetNetwork + class FixedTarget(TargetNetwork): def __init__(self, update_frequency): self._source = None diff --git a/all/approximation/target/polyak.py b/all/approximation/target/polyak.py index 65fa509a..2d89c1e4 100644 --- a/all/approximation/target/polyak.py +++ b/all/approximation/target/polyak.py @@ -2,8 +2,10 @@ import torch from .abstract import TargetNetwork + class PolyakTarget(TargetNetwork): '''TargetNetwork that updates using polyak averaging''' + def __init__(self, rate): self._source = None self._target = None @@ -19,7 +21,4 @@ def init(self, model): def update(self): for target_param, source_param in zip(self._target.parameters(), self._source.parameters()): - target_param.data.copy_( - target_param.data * (1.0 - self._rate) + - source_param.data * self._rate - ) + target_param.data.copy_(target_param.data * (1.0 - self._rate) + source_param.data * self._rate) diff --git a/all/approximation/target/trivial.py b/all/approximation/target/trivial.py index db9246a1..2e95b8d7 100644 --- a/all/approximation/target/trivial.py +++ b/all/approximation/target/trivial.py @@ -1,6 +1,7 @@ import torch from .abstract import TargetNetwork + class TrivialTarget(TargetNetwork): def __init__(self): self._model = None diff --git a/all/approximation/v_network.py b/all/approximation/v_network.py index 1bf3cc1c..6df364c1 100644 --- a/all/approximation/v_network.py +++ b/all/approximation/v_network.py @@ -1,6 +1,7 @@ from all.nn import RLNetwork from .approximation import Approximation + class VNetwork(Approximation): def __init__( self, @@ -17,6 +18,7 @@ def __init__( **kwargs ) + class VModule(RLNetwork): def forward(self, states): return super().forward(states).squeeze(-1) diff --git a/all/approximation/v_network_test.py b/all/approximation/v_network_test.py index 49283d1d..cc281ed7 100644 --- a/all/approximation/v_network_test.py +++ b/all/approximation/v_network_test.py @@ -7,10 +7,12 @@ STATE_DIM = 2 + def loss(value, error): target = value + error return ((target.detach() - value) ** 2).mean() + class TestVNetwork(unittest.TestCase): def setUp(self): torch.manual_seed(2) @@ -49,5 +51,6 @@ def test_multi_reinforce(self): with self.assertRaises(Exception): self.v.reinforce(loss(result3, torch.tensor([1, 2])).float()) + if __name__ == '__main__': unittest.main() diff --git a/all/bodies/_body.py b/all/bodies/_body.py index 04906c61..12f7ceb1 100644 --- a/all/bodies/_body.py +++ b/all/bodies/_body.py @@ -1,8 +1,9 @@ from all.agents import Agent + class Body(Agent): """ - A Body wraps a reinforcment learning Agent, altering its inputs and ouputs. + A Body wraps a reinforcement learning Agent, altering its inputs and outputs. The Body API is identical to the Agent API from the perspective of the rest of the system. This base class is provided only for semantic clarity. 
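Since the wrappers in the following diffs (DeepmindAtariBody, ClipRewards, TimeFeature, FrameStack) all build on the Body abstraction described above, a minimal sketch of a custom Body may help when reviewing them. The ScaleRewards class and its scale parameter below are hypothetical and not part of this patch; the sketch assumes the base Body routes incoming states through process_state() before delegating to the wrapped agent, which is the pattern the ClipRewards and FrameStack bodies in this changeset follow.

```python
# Hypothetical sketch (not part of this patch): a custom Body that rescales
# rewards before the wrapped agent sees them. Assumes the base Body forwards
# states through process_state() and otherwise delegates to the inner agent.
from all.bodies._body import Body


class ScaleRewards(Body):
    def __init__(self, agent, scale=0.01):
        super().__init__(agent)
        self._scale = scale

    def process_state(self, state):
        # State.update returns a new State with the 'reward' entry replaced.
        return state.update('reward', self._scale * state.reward)


# Usage: wrap any Agent and use the result exactly like the original agent.
# agent = ScaleRewards(agent, scale=0.01)
```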
diff --git a/all/bodies/atari.py b/all/bodies/atari.py index 99a2e51c..a5fe494b 100644 --- a/all/bodies/atari.py +++ b/all/bodies/atari.py @@ -3,17 +3,20 @@ from .rewards import ClipRewards from .vision import FrameStack + class DeepmindAtariBody(Body): - def __init__(self, agent, lazy_frames=False, episodic_lives=True, frame_stack=4): + def __init__(self, agent, lazy_frames=False, episodic_lives=True, frame_stack=4, clip_rewards=True): agent = FrameStack(agent, lazy=lazy_frames, size=frame_stack) - agent = ClipRewards(agent) + if clip_rewards: + agent = ClipRewards(agent) if episodic_lives: agent = EpisodicLives(agent) super().__init__(agent) + class EpisodicLives(Body): def process_state(self, state): - if not 'life_lost' in state: + if 'life_lost' not in state: return state if len(state) == 1: diff --git a/all/bodies/rewards.py b/all/bodies/rewards.py index 43c30e87..653bb508 100644 --- a/all/bodies/rewards.py +++ b/all/bodies/rewards.py @@ -2,6 +2,7 @@ import numpy as np from ._body import Body + class ClipRewards(Body): def process_state(self, state): return state.update('reward', self._clip(state.reward)) diff --git a/all/bodies/time.py b/all/bodies/time.py index 72ce4d38..a11acc21 100644 --- a/all/bodies/time.py +++ b/all/bodies/time.py @@ -2,6 +2,7 @@ from all.core import StateArray from ._body import Body + class TimeFeature(Body): def __init__(self, agent, scale=0.001): self.timestep = None diff --git a/all/bodies/time_test.py b/all/bodies/time_test.py index 5f88ffd4..31dc3ad8 100644 --- a/all/bodies/time_test.py +++ b/all/bodies/time_test.py @@ -74,5 +74,6 @@ def test_multi_env(self): tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor( [[0.3923, -0.2236, 4e-3], [-0.3195, -1.2050, 1e-3]]), atol=1e-04) + if __name__ == '__main__': unittest.main() diff --git a/all/bodies/vision.py b/all/bodies/vision.py index 6eee77d2..79cbc2ca 100644 --- a/all/bodies/vision.py +++ b/all/bodies/vision.py @@ -2,12 +2,14 @@ from all.core import State, StateArray from ._body import Body + class FrameStack(Body): def __init__(self, agent, size=4, lazy=False): super().__init__(agent) self._frames = [] self._size = size self._lazy = lazy + self._to_cache = TensorDeviceCache() def process_state(self, state): if not self._frames: @@ -15,15 +17,43 @@ def process_state(self, state): else: self._frames = self._frames[1:] + [state.observation] if self._lazy: - return LazyState.from_state(state, self._frames) + return LazyState.from_state(state, self._frames, self._to_cache) if isinstance(state, StateArray): return state.update('observation', torch.cat(self._frames, dim=1)) return state.update('observation', torch.cat(self._frames, dim=0)) + + +class TensorDeviceCache: + ''' + To efficiently implement device transfer of lazy states, this class + caches the transferred tensor so that it is not copied multiple times.
+ ''' + + def __init__(self, max_size=16): + self.max_size = max_size + self.cache_data = [] + + def convert(self, value, device): + cached = None + for el in self.cache_data: + if el[0] is value: + cached = el[1] + break + if cached is not None and cached.device == torch.device(device): + new_v = cached + else: + new_v = value.to(device) + self.cache_data.append((value, new_v)) + if len(self.cache_data) > self.max_size: + self.cache_data.pop(0) + return new_v + + class LazyState(State): @classmethod - def from_state(cls, state, frames): - state = LazyState(state, device=state.device) + def from_state(cls, state, frames, to_cache): + state = LazyState(state, device=frames[0].device) + state.to_cache = to_cache state['observation'] = frames return state @@ -34,3 +64,28 @@ def __getitem__(self, key): return v return torch.cat(dict.__getitem__(self, key), dim=0) return super().__getitem__(key) + + def update(self, key, value): + x = {} + for k in self.keys(): + if not k == key: + x[k] = super().__getitem__(k) + x[key] = value + state = LazyState(x, device=self.device) + state.to_cache = self.to_cache + return state + + def to(self, device): + if device == self.device: + return self + x = {} + for key, value in self.items(): + if key == 'observation': + x[key] = [self.to_cache.convert(v, device) for v in value] + # x[key] = [v.to(device) for v in value]#torch.cat(value,axis=0).to(device) + elif torch.is_tensor(value): + x[key] = value.to(device) + else: + x[key] = value + state = LazyState.from_state(x, x['observation'], self.to_cache) + return state diff --git a/all/core/__init__.py b/all/core/__init__.py index 71152483..540c3800 100644 --- a/all/core/__init__.py +++ b/all/core/__init__.py @@ -1,3 +1,3 @@ -from .state import State, StateArray +from .state import State, StateArray, MultiagentState -__all__ = ['State', 'StateArray'] +__all__ = ['State', 'StateArray', 'MultiagentState'] diff --git a/all/core/state.py b/all/core/state.py index 9e76dbac..bc566049 100644 --- a/all/core/state.py +++ b/all/core/state.py @@ -1,5 +1,6 @@ import numpy as np import torch +import warnings class State(dict): @@ -31,6 +32,7 @@ class State(dict): device (string): The torch device on which component tensors are stored. 
""" + def __init__(self, x, device='cpu', **kwargs): if not isinstance(x, dict): x = {'observation': x} @@ -64,15 +66,23 @@ def array(cls, list_of_states): device = list_of_states[0].device shape = (len(list_of_states), *list_of_states[0].shape) x = {} + for key in list_of_states[0].keys(): v = list_of_states[0][key] try: - if torch.is_tensor(v): + if isinstance(v, list) and len(v) > 0 and torch.is_tensor(v[0]): + x[key] = torch.stack([torch.stack(state[key]) for state in list_of_states]) + elif torch.is_tensor(v): x[key] = torch.stack([state[key] for state in list_of_states]) else: x[key] = torch.tensor([state[key] for state in list_of_states], device=device) - except: # # pylint: disable=bare-except - pass + except KeyError: + warnings.warn('KeyError while creating StateArray for key "{}", omitting.'.format(key)) + except ValueError: + warnings.warn('ValueError while creating StateArray for key "{}", omitting.'.format(key)) + except TypeError: + warnings.warn('TypeError while creating StateArray for key "{}", omitting.'.format(key)) + return StateArray(x, shape, device=device) def apply(self, model, *keys): @@ -187,6 +197,17 @@ def from_gym(cls, state, device='cpu', dtype=np.float32): x[key] = info[key] return State(x, device=device) + def to(self, device): + if device == self.device: + return self + x = {} + for key, value in self.items(): + if torch.is_tensor(value): + x[key] = value.to(device) + else: + x[key] = value + return type(self)(x, device=device, shape=self._shape) + @property def observation(self): """A tensor containing the current observation.""" @@ -215,6 +236,7 @@ def shape(self): def __len__(self): return 1 + class StateArray(State): """ An n-dimensional array of environment State objects. @@ -244,6 +266,7 @@ class StateArray(State): device (string): The torch device on which component tensors are stored. """ + def __init__(self, x, shape, device='cpu', **kwargs): if not isinstance(x, dict): x = {'observation': x} @@ -288,7 +311,7 @@ def as_output(self, tensor): return tensor.view((*self.shape, *tensor.shape[1:])) def apply_mask(self, tensor): - return tensor * self.mask.unsqueeze(-1) # pylint: disable=no-member + return tensor * self.mask.unsqueeze(-1) def flatten(self): """ @@ -337,9 +360,9 @@ def mask(self): def __getitem__(self, key): if isinstance(key, slice): shape = self['mask'][key].shape - return StateArray({k:v[key] for (k, v) in self.items()}, shape, device=self.device) + return StateArray({k: v[key] for (k, v) in self.items()}, shape, device=self.device) if isinstance(key, int): - return State({k:v[key] for (k, v) in self.items()}, device=self.device) + return State({k: v[key] for (k, v) in self.items()}, device=self.device) if torch.is_tensor(key): # some things may get los d = {} @@ -347,7 +370,7 @@ def __getitem__(self, key): for (k, v) in self.items(): try: d[k] = v[key] - except: # pylint: disable=bare-except + except KeyError: pass return self.__class__(d, shape, device=self.device) try: @@ -363,3 +386,67 @@ def shape(self): def __len__(self): return self.shape[0] + + +class MultiagentState(State): + def __init__(self, x, device='cpu', **kwargs): + if 'agent' not in x: + raise Exception('MultiagentState must contain an agent ID') + super().__init__(x, device=device, **kwargs) + + @property + def agent(self): + return self['agent'] + + @classmethod + def from_zoo(cls, agent, state, device='cpu', dtype=np.float32): + """ + Constructs a State object given the return value of an OpenAI gym reset()/step(action) call. 
+ + Args: + state (tuple): The return value of an OpenAI gym reset()/step(action) call + device (string): The device on which to store resulting tensors. + dtype: The type of the observation. + + Returns: + A State object. + """ + if not isinstance(state, tuple): + return MultiagentState({ + 'agent': agent, + 'observation': torch.from_numpy( + np.array( + state, + dtype=dtype + ), + ).to(device) + }, device=device) + + observation, reward, done, info = state + observation = torch.from_numpy( + np.array( + observation, + dtype=dtype + ), + ).to(device) + x = { + 'agent': agent, + 'observation': observation, + 'reward': float(reward), + 'done': done, + } + info = info if info else {} + for key in info: + x[key] = info[key] + return MultiagentState(x, device=device) + + def to(self, device): + if device == self.device: + return self + x = {} + for key, value in self.items(): + if torch.is_tensor(value): + x[key] = value.to(device) + else: + x[key] = value + return type(self)(x, device=device, shape=self._shape) diff --git a/all/core/state_test.py b/all/core/state_test.py index a1005c97..f06befc2 100644 --- a/all/core/state_test.py +++ b/all/core/state_test.py @@ -1,9 +1,11 @@ import unittest +import warnings import numpy as np import torch import torch_testing as tt from all.core import State, StateArray + class StateTest(unittest.TestCase): def test_constructor_defaults(self): observation = torch.randn(3, 4) @@ -88,7 +90,6 @@ def test_apply(self): self.assertEqual(output.shape, (5, 3)) self.assertNotEqual(output.sum().item(), 0) - def test_apply_done(self): observation = torch.randn(3, 4) state = State.from_gym((observation, 0., True, {})) @@ -97,6 +98,14 @@ def test_apply_done(self): self.assertEqual(output.shape, (5, 3)) self.assertEqual(output.sum().item(), 0) + def test_to_device(self): + observation = torch.randn(3, 4) + state = State(observation, device=torch.device('cpu')) + state_cpu = state.to("cpu") + self.assertTrue(torch.equal(state['observation'], state_cpu['observation'])) + self.assertFalse(state is state_cpu) + + class StateArrayTest(unittest.TestCase): def test_constructor_defaults(self): raw = torch.randn(3, 4) @@ -122,7 +131,6 @@ def test_apply_done(self): self.assertEqual(output.shape, (3, 2)) self.assertEqual(output.sum().item(), 0) - def test_as_output(self): observation = torch.randn(3, 4) state = StateArray(observation, (3,)) @@ -163,6 +171,37 @@ def test_view(self): self.assertEqual(state.shape, (2, 3)) self.assertEqual(state.observation.shape, (2, 3, 3, 4)) + def test_key_error(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + StateArray.array([ + State({ + 'observation': torch.tensor([1, 2]), + 'other_key': True + }), + State({ + 'observation': torch.tensor([1, 2]), + }), + ]) + self.assertEqual(len(w), 1) + self.assertEqual(w[0].message.args[0], 'KeyError while creating StateArray for key "other_key", omitting.') + + def test_type_error(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + StateArray.array([ + State({ + 'observation': torch.tensor([1, 2]), + 'other_key': torch.tensor([1]) + }), + State({ + 'observation': torch.tensor([1, 2]), + 'other_key': 5. 
+ }), + ]) + self.assertEqual(len(w), 1) + self.assertEqual(w[0].message.args[0], 'TypeError while creating StateArray for key "other_key", omitting.') + if __name__ == "__main__": unittest.main() diff --git a/all/environments/__init__.py b/all/environments/__init__.py index adcd78a3..b6e331e8 100644 --- a/all/environments/__init__.py +++ b/all/environments/__init__.py @@ -1,5 +1,17 @@ -from .abstract import Environment +from ._environment import Environment +from ._multiagent_environment import MultiagentEnvironment from .gym import GymEnvironment from .atari import AtariEnvironment +from .multiagent_atari import MultiagentAtariEnv +from .multiagent_pettingzoo import MultiagentPettingZooEnv +from .pybullet import PybulletEnvironment -__all__ = ["Environment", "GymEnvironment", "AtariEnvironment"] +__all__ = [ + "Environment", + "MultiagentEnvironment", + "GymEnvironment", + "AtariEnvironment", + "MultiagentAtariEnv", + "MultiagentPettingZooEnv", + "PybulletEnvironment", +] diff --git a/all/environments/abstract.py b/all/environments/_environment.py similarity index 94% rename from all/environments/abstract.py rename to all/environments/_environment.py index d4de6fa6..36305b70 100644 --- a/all/environments/abstract.py +++ b/all/environments/_environment.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod + class Environment(ABC): """ A reinforcement learning Environment. @@ -21,7 +22,7 @@ def name(self): @abstractmethod def reset(self): """ - Reset the environment and return a new intial state. + Reset the environment and return a new initial state. Returns ------- @@ -57,7 +58,7 @@ def render(self, **kwargs): @abstractmethod def close(self): """ - Clean up any extraneaous environment objects. + Clean up any extraneous environment objects. """ @property @@ -82,7 +83,7 @@ def state_space(self): @property def observation_space(self): """ - Alias for Environemnt.state_space. + Alias for Environment.state_space. Returns ------- diff --git a/all/environments/_multiagent_environment.py b/all/environments/_multiagent_environment.py new file mode 100644 index 00000000..34470af0 --- /dev/null +++ b/all/environments/_multiagent_environment.py @@ -0,0 +1,104 @@ +from abc import ABC, abstractmethod + + +class MultiagentEnvironment(ABC): + ''' + A multiagent reinforcement learning Environment. + + The Multiagent variant of the Environment object. + An Environment defines the dynamics of a particular problem: + the states, the actions, the transitions between states, and the rewards given to the agent. + Environments are often used to benchmark reinforcement learning agents, + or to define real problems that the user hopes to solve using reinforcement learning. + ''' + + @abstractmethod + def reset(self): + ''' + Reset the environment and return a new initial state for the first agent. + + Returns: + all.core.MultiagentState: The initial state for the next episode. + ''' + + @abstractmethod + def step(self, action): + ''' + Apply an action for the current agent and get the multiagent state for the next agent. + + Parameters: + action: The Action for the current agent and timestep. + + Returns: + all.core.MultiagentState: The state for the next agent. + ''' + + @abstractmethod + def render(self, **kwargs): + '''Render the current environment state.''' + + @abstractmethod + def close(self): + '''Clean up any extraneous environment objects.''' + + @abstractmethod + def agent_iter(self): + ''' + Create an iterable in which the next element is always the name of the agent whose turn it is to act.
+ + Returns: + An Iterable over Agent strings. + ''' + + @abstractmethod + def last(self): + ''' + Get the MultiagentState object for the current agent. + + Returns: + The all.core.MultiagentState object for the current agent. + ''' + + @abstractmethod + def is_done(self, agent): + ''' + Determine whether a given agent is done. + + Args: + agent (str): The name of the agent. + + Returns: + A boolean representing whether the given agent is done. + ''' + + @property + def state(self): + '''The State for the current agent.''' + return self.last() + + @property + @abstractmethod + def name(self): + '''str: The name of the environment.''' + + @property + @abstractmethod + def state_spaces(self): + '''A dictionary of state spaces for each agent.''' + + @property + def observation_spaces(self): + '''Alias for MultiagentEnvironment.state_spaces.''' + return self.state_space + + @property + @abstractmethod + def action_spaces(self): + '''A dictionary of action spaces for each agent.''' + + @property + @abstractmethod + def device(self): + ''' + The torch device the environment lives on. + ''' diff --git a/all/environments/atari.py b/all/environments/atari.py index 8f3dd689..b5fca5a7 100644 --- a/all/environments/atari.py +++ b/all/environments/atari.py @@ -7,6 +7,7 @@ WarpFrame, LifeLostEnv, ) +from all.core import State class AtariEnvironment(GymEnvironment): @@ -31,6 +32,11 @@ def __init__(self, name, *args, **kwargs): def name(self): return self._name + def reset(self): + state = self._env.reset(), 0., False, {'life_lost': False} + self._state = State.from_gym(state, dtype=self._env.observation_space.dtype, device=self._device) + return self._state + def duplicate(self, n): return [ AtariEnvironment(self._name, *self._args, **self._kwargs) for _ in range(n) diff --git a/all/environments/atari_test.py b/all/environments/atari_test.py index 143e7481..4b6bf4aa 100644 --- a/all/environments/atari_test.py +++ b/all/environments/atari_test.py @@ -21,7 +21,6 @@ def test_step(self): self.assertEqual(state.mask, 1) self.assertEqual(state['life_lost'], False) - def test_step_until_life_lost(self): env = AtariEnvironment('Breakout') env.reset() @@ -35,7 +34,6 @@ def test_step_until_life_lost(self): self.assertEqual(state.mask, 1) self.assertEqual(state['life_lost'], True) - def test_step_until_done(self): env = AtariEnvironment('Breakout') env.reset() diff --git a/all/environments/atari_wrappers.py b/all/environments/atari_wrappers.py index 9d1ff280..3d2b89d0 100644 --- a/all/environments/atari_wrappers.py +++ b/all/environments/atari_wrappers.py @@ -1,6 +1,5 @@ -# pylint: skip-file ''' -A subset of Atari wraooers modified from: +A subset of Atari wrappers modified from: https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py Other behaviors were implemented as Bodies. ''' @@ -43,18 +42,19 @@ def reset(self, **kwargs): def step(self, ac): return self.env.step(ac) + class FireResetEnv(gym.Wrapper): def __init__(self, env): ''' Take action on reset for environments that are fixed until firing. - + Important: This was modified to also fire on lives lost. 
''' gym.Wrapper.__init__(self, env) assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 self.lives = 0 - self.was_real_done = True + self.was_real_done = True def reset(self, **kwargs): self.env.reset(**kwargs) @@ -64,8 +64,7 @@ def reset(self, **kwargs): def step(self, action): obs, reward, done, info = self.env.step(action) - lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: + if self.lost_life(): obs, done = self.fire() self.lives = self.env.unwrapped.ale.lives() return obs, reward, done, info @@ -73,14 +72,15 @@ def step(self, action): def fire(self): obs, _, done, _ = self.env.step(1) if done: - self.env.reset(**kwargs) + self.env.reset() obs, _, done, _ = self.env.step(2) if done: - obs = self.env.reset(**kwargs) + obs = self.env.reset() done = False return obs, done def lost_life(self): + lives = self.env.unwrapped.ale.lives() return lives < self.lives and lives > 0 @@ -89,8 +89,8 @@ def __init__(self, env, skip=4): '''Return only every `skip`-th frame''' gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) - self._skip = skip + self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8) + self._skip = skip def step(self, action): '''Repeat action, sum reward, and max over last observations.''' @@ -98,8 +98,10 @@ def step(self, action): done = None for i in range(self._skip): obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs total_reward += reward if done: break @@ -165,6 +167,7 @@ def observation(self, obs): obs[self._key] = frame return np.moveaxis(obs, -1, 0) + class LifeLostEnv(gym.Wrapper): def __init__(self, env): ''' @@ -184,5 +187,5 @@ def step(self, action): lives = self.env.unwrapped.ale.lives() life_lost = (lives < self.lives and lives > 0) self.lives = lives - info = { 'life_lost': life_lost } + info = {'life_lost': life_lost} return obs, reward, done, info diff --git a/all/environments/gym.py b/all/environments/gym.py index 5b3a6764..7425a824 100644 --- a/all/environments/gym.py +++ b/all/environments/gym.py @@ -1,9 +1,11 @@ import gym import torch from all.core import State -from .abstract import Environment +from ._environment import Environment +import cloudpickle gym.logger.set_level(40) + class GymEnvironment(Environment): ''' A wrapper for OpenAI Gym environments (see: https://gym.openai.com). 
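# Illustrative sketch (not part of the patch): the GymEnvironment wrapper described above in a
# minimal single-agent loop. 'CartPole-v0' is an assumed example id; any installed gym id works,
# and random actions stand in for a real Agent.
from all.environments import GymEnvironment

env = GymEnvironment('CartPole-v0', device='cpu')  # device controls where State tensors live
env.seed(0)
state = env.reset()            # an all.core.State with reward == 0. and done == False
returns = 0.
while not state.done:
    action = env.action_space.sample()  # stand-in for agent.act(state)
    state = env.step(action)            # State exposes .observation, .reward, .done, .mask
    returns += state.reward
print('episode return:', returns)
env.close()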
@@ -17,15 +19,18 @@ class GymEnvironment(Environment): Args: env: Either a string or an OpenAI gym environment - device (optional): the device on which tensors will be stored + name (str, optional): the name of the environment + device (str, optional): the device on which tensors will be stored ''' - def __init__(self, env, device=torch.device('cpu')): + + def __init__(self, env, device=torch.device('cpu'), name=None): if isinstance(env, str): self._name = env env = gym.make(env) else: self._name = env.__class__.__name__ - + if name: + self._name = name self._env = env self._state = None self._action = None @@ -39,7 +44,8 @@ def name(self): return self._name def reset(self): - self._state = State.from_gym(self._env.reset(), dtype=self._env.observation_space.dtype, device=self._device) + state = self._env.reset(), 0., False, None + self._state = State.from_gym(state, dtype=self._env.observation_space.dtype, device=self._device) return self._state def step(self, action): @@ -60,7 +66,7 @@ def seed(self, seed): self._env.seed(seed) def duplicate(self, n): - return [GymEnvironment(self._name, device=self.device) for _ in range(n)] + return [GymEnvironment(cloudpickle.loads(cloudpickle.dumps(self._env)), device=self.device) for _ in range(n)] @property def state_space(self): diff --git a/all/environments/gym_test.py b/all/environments/gym_test.py index 2571c052..43bf9aef 100644 --- a/all/environments/gym_test.py +++ b/all/environments/gym_test.py @@ -37,7 +37,6 @@ def test_step(self): self.assertFalse(state.done) self.assertEqual(state.mask, 1) - def test_step_until_done(self): env = GymEnvironment('CartPole-v0') env.reset() diff --git a/all/environments/multiagent_atari.py b/all/environments/multiagent_atari.py new file mode 100644 index 00000000..1cc9feb4 --- /dev/null +++ b/all/environments/multiagent_atari.py @@ -0,0 +1,34 @@ +import importlib +import numpy as np +import torch +import gym +from all.core import MultiagentState +from ._multiagent_environment import MultiagentEnvironment +from .multiagent_pettingzoo import MultiagentPettingZooEnv + + +class MultiagentAtariEnv(MultiagentPettingZooEnv): + ''' + A wrapper for PettingZoo Atari environments (see: https://www.pettingzoo.ml/atari). + + This wrapper converts the output of the PettingZoo environment to PyTorch tensors, + and wraps them in a State object that can be passed to an Agent. + + Args: + env_name (string): A string representing the name of the environment (e.g. 
pong-v1) + device (optional): the device on which tensors will be stored + ''' + + def __init__(self, env_name, device='cuda', **pettingzoo_params): + env = self._load_env(env_name, pettingzoo_params) + super().__init__(env, name=env_name, device=device) + + def _load_env(self, env_name, pettingzoo_params): + from pettingzoo import atari + from supersuit import resize_v0, frame_skip_v0, reshape_v0, max_observation_v0 + env = importlib.import_module('pettingzoo.atari.{}'.format(env_name)).env(obs_type='grayscale_image', **pettingzoo_params) + env = max_observation_v0(env, 2) + env = frame_skip_v0(env, 4) + env = resize_v0(env, 84, 84) + env = reshape_v0(env, (1, 84, 84)) + return env diff --git a/all/environments/multiagent_atari_test.py b/all/environments/multiagent_atari_test.py new file mode 100644 index 00000000..76f85935 --- /dev/null +++ b/all/environments/multiagent_atari_test.py @@ -0,0 +1,83 @@ +import unittest +import torch +from all.environments import MultiagentAtariEnv + + +class MultiagentAtariEnvTest(unittest.TestCase): + def test_init(self): + MultiagentAtariEnv('pong_v1', device='cpu') + MultiagentAtariEnv('mario_bros_v2', device='cpu') + MultiagentAtariEnv('entombed_cooperative_v2', device='cpu') + + def test_reset(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + state = env.reset() + self.assertEqual(state.observation.shape, (1, 84, 84)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) + self.assertEqual(state['agent'], 'first_0') + + def test_step(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + env.reset() + state = env.step(0) + self.assertEqual(state.observation.shape, (1, 84, 84)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) + self.assertEqual(state['agent'], 'second_0') + + def test_step_tensor(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + env.reset() + state = env.step(torch.tensor([0])) + self.assertEqual(state.observation.shape, (1, 84, 84)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) + self.assertEqual(state['agent'], 'second_0') + + def test_name(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + self.assertEqual(env.name, 'pong_v1') + + def test_agent_iter(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + env.reset() + it = iter(env.agent_iter()) + self.assertEqual(next(it), 'first_0') + + def test_state_spaces(self): + state_spaces = MultiagentAtariEnv('pong_v1', device='cpu').state_spaces + self.assertEqual(state_spaces['first_0'].shape, (1, 84, 84)) + self.assertEqual(state_spaces['second_0'].shape, (1, 84, 84)) + + def test_action_spaces(self): + action_spaces = MultiagentAtariEnv('pong_v1', device='cpu').action_spaces + self.assertEqual(action_spaces['first_0'].n, 18) + self.assertEqual(action_spaces['second_0'].n, 18) + + def test_list_agents(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + self.assertEqual(env.agents, ['first_0', 'second_0']) + + def test_is_done(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + env.reset() + self.assertFalse(env.is_done('first_0')) + self.assertFalse(env.is_done('second_0')) + + def test_last(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + env.reset() + state = env.last() + self.assertEqual(state.observation.shape, (1, 84, 84)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) 
+ self.assertEqual(state['agent'], 'first_0') + + +if __name__ == "__main__": + unittest.main() diff --git a/all/environments/multiagent_pettingzoo.py b/all/environments/multiagent_pettingzoo.py new file mode 100644 index 00000000..cab4c2c6 --- /dev/null +++ b/all/environments/multiagent_pettingzoo.py @@ -0,0 +1,130 @@ +import importlib +import numpy as np +import torch +import cloudpickle +import gym +from all.core import MultiagentState +from ._multiagent_environment import MultiagentEnvironment + + +class MultiagentPettingZooEnv(MultiagentEnvironment): + ''' + A wrapper for generael PettingZoo environments (see: https://www.pettingzoo.ml/). + + This wrapper converts the output of the PettingZoo environment to PyTorch tensors, + and wraps them in a State object that can be passed to an Agent. + + Args: + zoo_env (AECEnv): A PettingZoo AECEnv environment (e.g. pettingzoo.mpe.simple_push_v2) + device (optional): the device on which tensors will be stored + ''' + + def __init__(self, zoo_env, name, device='cuda'): + env = zoo_env + env.reset() + self._env = env + self._name = name + self._device = device + self.agents = self._env.agents + self.subenvs = { + agent: SubEnv(agent, device, self.state_spaces[agent], self.action_spaces[agent]) + for agent in self.agents + } + + ''' + Reset the environment and return a new initial state. + + Returns: + An initial MultiagentState object. + ''' + + def reset(self): + self._env.reset() + return self.last() + + ''' + Reset the environment and return a new initial state. + + Args: + action (int): An int or tensor containing a single integer representing the action. + + Returns: + The MultiagentState object for the next agent + ''' + + def step(self, action): + if action is None: + self._env.step(action) + return + self._env.step(self._convert(action)) + return self.last() + + def seed(self, seed): + self._env.seed(seed) + + def render(self, mode='human'): + return self._env.render(mode=mode) + + def close(self): + self._env.close() + + def agent_iter(self): + return self._env.agent_iter() + + def is_done(self, agent): + return self._env.dones[agent] + + def duplicate(self, n): + return [MultiagentPettingZooEnv(cloudpickle.loads(cloudpickle.dumps(self._env)), self._name, device=self.device) for _ in range(n)] + + def last(self): + observation, reward, done, info = self._env.last() + selected_obs_space = self._env.observation_spaces[self._env.agent_selection] + return MultiagentState.from_zoo(self._env.agent_selection, (observation, reward, done, info), device=self._device, dtype=selected_obs_space.dtype) + + @property + def name(self): + return self._name + + @property + def device(self): + return self._device + + @property + def agent_selection(self): + return self._env.agent_selection + + @property + def state_spaces(self): + return self._env.observation_spaces + + @property + def observation_spaces(self): + return self._env.observation_spaces + + @property + def action_spaces(self): + return self._env.action_spaces + + def _convert(self, action): + agent = self._env.agent_selection + action_space = self._env.action_spaces[agent] + if torch.is_tensor(action): + if isinstance(action_space, gym.spaces.Discrete): + return action.item() + if isinstance(action_space, gym.spaces.Box): + return action.cpu().detach().numpy().reshape(-1) + raise TypeError("Unknown action space type") + return action + + +class SubEnv(): + def __init__(self, name, device, state_space, action_space): + self.name = name + self.device = device + self.state_space = state_space + 
self.action_space = action_space + + @property + def observation_space(self): + return self.state_space diff --git a/all/environments/multiagent_pettingzoo_test.py b/all/environments/multiagent_pettingzoo_test.py new file mode 100644 index 00000000..827119ad --- /dev/null +++ b/all/environments/multiagent_pettingzoo_test.py @@ -0,0 +1,94 @@ +import unittest +import torch +from all.environments import MultiagentPettingZooEnv +from pettingzoo.mpe import simple_world_comm_v2 + + +class MultiagentPettingZooEnvTest(unittest.TestCase): + def test_init(self): + self._make_env() + + def test_reset(self): + env = self._make_env() + state = env.reset() + self.assertEqual(state.observation.shape, (34,)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) + self.assertEqual(state['agent'], 'leadadversary_0') + + def test_step(self): + env = self._make_env() + env.reset() + state = env.step(0) + self.assertEqual(state.observation.shape, (34,)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) + self.assertEqual(state['agent'], 'adversary_0') + + def test_step_tensor(self): + env = self._make_env() + env.reset() + state = env.step(0) + self.assertEqual(state.observation.shape, (34,)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) + self.assertEqual(state['agent'], 'adversary_0') + + def test_name(self): + env = self._make_env() + self.assertEqual(env.name, 'simple_world_comm_v2') + + def test_agent_iter(self): + env = self._make_env() + env.reset() + it = iter(env.agent_iter()) + self.assertEqual(next(it), 'leadadversary_0') + + def test_state_spaces(self): + state_spaces = self._make_env().state_spaces + self.assertEqual(state_spaces['leadadversary_0'].shape, (34,)) + self.assertEqual(state_spaces['adversary_0'].shape, (34,)) + + def test_action_spaces(self): + action_spaces = self._make_env().action_spaces + self.assertEqual(action_spaces['leadadversary_0'].n, 20) + self.assertEqual(action_spaces['adversary_0'].n, 5) + + def test_list_agents(self): + env = self._make_env() + self.assertEqual(env.agents, ['leadadversary_0', 'adversary_0', 'adversary_1', 'adversary_2', 'agent_0', 'agent_1']) + + def test_is_done(self): + env = self._make_env() + env.reset() + self.assertFalse(env.is_done('leadadversary_0')) + self.assertFalse(env.is_done('adversary_0')) + + def test_last(self): + env = self._make_env() + env.reset() + state = env.last() + self.assertEqual(state.observation.shape, (34,)) + self.assertEqual(state.reward, 0) + self.assertEqual(state.done, False) + self.assertEqual(state.mask, 1.) 
+ self.assertEqual(state['agent'], 'leadadversary_0') + + def test_variable_spaces(self): + env = MultiagentPettingZooEnv(simple_world_comm_v2.env(), name="simple_world_comm_v2", device='cpu') + state = env.reset() + # tests that action spaces work + for agent in env.agents: + state = env.last() + self.assertTrue(env.observation_spaces[agent].contains(state['observation'].cpu().detach().numpy())) + env.step(env.action_spaces[env.agent_selection].sample()) + + def _make_env(self): + return MultiagentPettingZooEnv(simple_world_comm_v2.env(), name="simple_world_comm_v2", device='cpu') + + +if __name__ == "__main__": + unittest.main() diff --git a/all/environments/pybullet.py b/all/environments/pybullet.py new file mode 100644 index 00000000..a986e5b4 --- /dev/null +++ b/all/environments/pybullet.py @@ -0,0 +1,17 @@ +from .gym import GymEnvironment + + +class PybulletEnvironment(GymEnvironment): + short_names = { + "ant": "AntBulletEnv-v0", + "cheetah": "HalfCheetahBulletEnv-v0", + "humanoid": "HumanoidBulletEnv-v0", + "hopper": "HopperBulletEnv-v0", + "walker": "Walker2DBulletEnv-v0" + } + + def __init__(self, name, **kwargs): + import pybullet_envs + if name in self.short_names: + name = self.short_names[name] + super().__init__(name, **kwargs) diff --git a/all/environments/pybullet_test.py b/all/environments/pybullet_test.py new file mode 100644 index 00000000..d08e5bfb --- /dev/null +++ b/all/environments/pybullet_test.py @@ -0,0 +1,33 @@ +import unittest +from all.environments import PybulletEnvironment, GymEnvironment + + +class PybulletEnvironmentTest(unittest.TestCase): + def test_env_short_name(self): + for short_name, long_name in PybulletEnvironment.short_names.items(): + env = PybulletEnvironment(short_name) + self.assertEqual(env.name, long_name) + + def test_env_full_name(self): + env = PybulletEnvironment('HalfCheetahBulletEnv-v0') + self.assertEqual(env.name, 'HalfCheetahBulletEnv-v0') + + def test_reset(self): + env = PybulletEnvironment('cheetah') + state = env.reset() + self.assertEqual(state.observation.shape, (26,)) + self.assertEqual(state.reward, 0.) + self.assertFalse(state.done) + self.assertEqual(state.mask, 1) + + def test_step(self): + env = PybulletEnvironment('cheetah') + env.seed(0) + state = env.reset() + state = env.step(env.action_space.sample()) + self.assertEqual(state.observation.shape, (26,)) + self.assertGreater(state.reward, -1.) + self.assertLess(state.reward, 1) + self.assertNotEqual(state.reward, 0.) 
+ self.assertFalse(state.done) + self.assertEqual(state.mask, 1) diff --git a/all/experiments/__init__.py b/all/experiments/__init__.py index 46a7e580..9b94ed6b 100644 --- a/all/experiments/__init__.py +++ b/all/experiments/__init__.py @@ -2,19 +2,22 @@ from .experiment import Experiment from .single_env_experiment import SingleEnvExperiment from .parallel_env_experiment import ParallelEnvExperiment +from .multiagent_env_experiment import MultiagentEnvExperiment from .writer import ExperimentWriter +from .writer import CometWriter from .plots import plot_returns_100 from .slurm import SlurmExperiment -from .watch import GreedyAgent, watch, load_and_watch +from .watch import watch, load_and_watch __all__ = [ "run_experiment", "Experiment", "SingleEnvExperiment", "ParallelEnvExperiment", + "MultiagentEnvExperiment", "SlurmExperiment", - "GreedyAgent", "ExperimentWriter", + "CometWriter", "watch", "load_and_watch", ] diff --git a/all/experiments/experiment.py b/all/experiments/experiment.py index 55ad40d5..b7837569 100644 --- a/all/experiments/experiment.py +++ b/all/experiments/experiment.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod import numpy as np +from scipy import stats +import torch class Experiment(ABC): @@ -11,22 +13,13 @@ class Experiment(ABC): quiet (bool): If False, the Experiment will print information about episode returns to standard out. ''' + def __init__(self, writer, quiet): self._writer = writer self._quiet = quiet self._best_returns = -np.inf self._returns100 = [] - @property - @abstractmethod - def frame(self): - '''The index of the current training frame.''' - - @property - @abstractmethod - def episode(self): - '''The index of the current training episode''' - @abstractmethod def train(self, frames=np.inf, episodes=np.inf): ''' @@ -45,12 +38,22 @@ def test(self, episodes=100): Test the agent in eval mode for a certain number of episodes. Args: - episodes (int): The number of test epsiodes. + episodes (int): The number of test episodes. Returns: list(float): A list of all returns received during testing. ''' + @property + @abstractmethod + def frame(self): + '''The index of the current training frame.''' + + @property + @abstractmethod + def episode(self): + '''The index of the current training episode''' + def _log_training_episode(self, returns, fps): if not self._quiet: print('episode: {}, frame: {}, fps: {}, returns: {}'.format(self.episode, self.frame, int(fps), returns)) @@ -72,4 +75,12 @@ def _log_test_episode(self, episode, returns): print('test episode: {}, returns: {}'.format(episode, returns)) def _log_test(self, returns): + if not self._quiet: + print('test returns (mean ± sem): {} ± {}'.format(np.mean(returns), stats.sem(returns))) self._writer.add_summary('returns-test', np.mean(returns), np.std(returns)) + + def save(self): + return self._preset.save('{}/preset.pt'.format(self._writer.log_dir)) + + def close(self): + self._writer.close() diff --git a/all/experiments/multiagent_env_experiment.py b/all/experiments/multiagent_env_experiment.py new file mode 100644 index 00000000..6bbacb1d --- /dev/null +++ b/all/experiments/multiagent_env_experiment.py @@ -0,0 +1,179 @@ +from timeit import default_timer as timer +import numpy as np +from scipy import stats +from .writer import ExperimentWriter, CometWriter +from .experiment import Experiment + + +class MultiagentEnvExperiment(): + ''' + An Experiment object for training and testing Multiagents. + + Args: + preset (all.presets.Preset): A Multiagent preset. 
+ env (all.environments.MultiagentEnvironment): A multiagent environment. + log_dir (str, optional): The directory in which to save the logs and model. + name (str, optional): The name of the experiment. + quiet (bool, optional): Whether or not to print training information. + render (bool, optional): Whether or not to render during training. + save_freq (int, optional): How often to save the model to disk. + train_steps (int, optional): The number of steps for which to train. + write_loss (bool, optional): Whether or not to log advanced loss information. + ''' + + def __init__( + self, + preset, + env, + logdir='runs', + name=None, + quiet=False, + render=False, + save_freq=100, + train_steps=float('inf'), + write_loss=True, + writer="tensorboard" + ): + self._name = name if name is not None else preset.name + self._writer = self._make_writer(logdir, self._name, env.name, write_loss, writer) + self._agent = preset.agent(writer=self._writer, train_steps=train_steps) + self._env = env + self._episode = 0 + self._frame = 0 + self._logdir = logdir + self._preset = preset + self._quiet = quiet + self._render = render + self._save_freq = save_freq + + if render: + self._env.render() + + ''' + Train the Multiagent for a certain number of frames or episodes. + If both frames and episodes are specified, then the training loop will exit + when either condition is satisfied. + + Args: + frames (int): The maximum number of training frames. + episodes (bool): The maximum number of training episodes. + + Returns: + MultiagentEnvExperiment: The experiment object. + ''' + + def train(self, frames=np.inf, episodes=np.inf): + while not self._done(frames, episodes): + self._run_training_episode() + return self + + ''' + Test the agent in eval mode for a certain number of episodes. + + Args: + episodes (int): The number of test episodes. + + Returns: + list(float): A list of all returns received during testing. 
+ ''' + + def test(self, episodes=100): + test_agent = self._preset.test_agent() + returns = {} + for episode in range(episodes): + episode_returns = self._run_test_episode(test_agent) + for agent, r in episode_returns.items(): + if agent in returns: + returns[agent].append(r) + else: + returns[agent] = [r] + self._log_test_episode(episode, episode_returns) + self._log_test(returns) + return returns + + '''int: The number of completed training frames''' + @property + def frame(self): + return self._frame + + '''int: The number of completed training episodes''' + @property + def episode(self): + return self._episode + + def _run_training_episode(self): + # initialize timer + start_time = timer() + start_frame = self._frame + + # initialize the episode + self._env.reset() + returns = {agent: 0 for agent in self._env.agents} + + for agent in self._env.agent_iter(): + if self._render: + self._env.render() + state = self._env.last() + returns[agent] += state.reward + action = self._agent.act(state) + if state.done: + self._env.step(None) + else: + self._env.step(action) + self._frame += 1 + + # stop the timer + end_time = timer() + fps = (self._frame - start_frame) / (end_time - start_time) + + # finalize the episode + self._log_training_episode(returns, fps) + self._save_model() + self._episode += 1 + + def _run_test_episode(self, test_agent): + self._env.reset() + returns = {agent: 0 for agent in self._env.agents} + + for agent in self._env.agent_iter(): + if self._render: + self._env.render() + state = self._env.last() + returns[agent] += state.reward + action = test_agent.act(state) + if state.done: + self._env.step(None) + else: + self._env.step(action) + self._frame += 1 + + return returns + + def _done(self, frames, episodes): + return self._frame > frames or self._episode > episodes + + def _log_training_episode(self, returns, fps): + if not self._quiet: + print('returns: {}'.format(returns)) + print('fps: {}'.format(fps)) + for agent in self._env.agents: + self._writer.add_evaluation('{}/returns/frame'.format(agent), returns[agent], step="frame") + + def _log_test_episode(self, episode, returns): + if not self._quiet: + print('test episode: {}, returns: {}'.format(episode, returns)) + + def _log_test(self, returns): + for agent, agent_returns in returns.items(): + if not self._quiet: + print('{} test returns (mean ± sem): {} ± {}'.format(agent, np.mean(agent_returns), stats.sem(agent_returns))) + self._writer.add_summary('{}/returns-test'.format(agent), np.mean(agent_returns), np.std(agent_returns)) + + def _save_model(self): + if self._save_freq != float('inf') and self._episode % self._save_freq == 0: + self._preset.save('{}/preset.pt'.format(self._writer.log_dir)) + + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + if writer == "comet": + return CometWriter(self, agent_name, env_name, loss=write_loss, logdir=logdir) + return ExperimentWriter(self, agent_name, env_name, loss=write_loss, logdir=logdir) diff --git a/all/experiments/multiagent_env_experiment_test.py b/all/experiments/multiagent_env_experiment_test.py new file mode 100644 index 00000000..0d8850f0 --- /dev/null +++ b/all/experiments/multiagent_env_experiment_test.py @@ -0,0 +1,110 @@ +import unittest +import numpy as np +import torch +from all.presets.atari import dqn +from all.presets import IndependentMultiagentPreset +from all.environments import MultiagentAtariEnv +from all.experiments import MultiagentEnvExperiment +from all.logging import Writer + + +class MockWriter(Writer): + def 
__init__(self, experiment, label, write_loss): + self.data = {} + self.label = label + self.write_loss = write_loss + self.experiment = experiment + + def add_scalar(self, key, value, step="frame"): + if key not in self.data: + self.data[key] = {"values": [], "steps": []} + self.data[key]["values"].append(value) + self.data[key]["steps"].append(self._get_step(step)) + + def add_loss(self, name, value, step="frame"): + pass + + def add_schedule(self, name, value, step="frame"): + pass + + def add_evaluation(self, name, value, step="frame"): + self.add_scalar("evaluation/" + name, value, self._get_step(step)) + + def add_summary(self, name, mean, std, step="frame"): + self.add_evaluation(name + "/mean", mean, step) + self.add_evaluation(name + "/std", std, step) + + def _get_step(self, _type): + if _type == "frame": + return self.experiment.frame + if _type == "episode": + return self.experiment.episode + return _type + + def close(self): + pass + + +class MockExperiment(MultiagentEnvExperiment): + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + self._writer = MockWriter(self, agent_name + '_' + env_name, write_loss) + return self._writer + + +class TestMultiagentEnvExperiment(unittest.TestCase): + def setUp(self): + np.random.seed(0) + torch.manual_seed(0) + self.env = MultiagentAtariEnv('space_invaders_v1', device='cpu') + self.env.seed(0) + self.experiment = None + + def test_adds_default_name(self): + experiment = MockExperiment(self.make_preset(), self.env, quiet=True, save_freq=float('inf')) + self.assertEqual(experiment._writer.label, "independent_space_invaders_v1") + + def test_adds_custom_name(self): + experiment = MockExperiment(self.make_preset(), self.env, name='custom', quiet=True, save_freq=float('inf')) + self.assertEqual(experiment._writer.label, "custom_space_invaders_v1") + + def test_writes_training_returns(self): + experiment = MockExperiment(self.make_preset(), self.env, quiet=True, save_freq=float('inf')) + experiment.train(episodes=3) + self.assertEqual(experiment._writer.data, { + 'evaluation/first_0/returns/frame': {'values': [465.0, 235.0, 735.0, 415.0], 'steps': [766, 1524, 2440, 3038]}, + 'evaluation/second_0/returns/frame': {'values': [235.0, 465.0, 170.0, 295.0], 'steps': [766, 1524, 2440, 3038]} + }) + + def test_writes_test_returns(self): + experiment = MockExperiment(self.make_preset(), self.env, quiet=True, save_freq=float('inf')) + experiment.train(episodes=3) + experiment._writer.data = {} + experiment.test(episodes=3) + self.assertEqual(list(experiment._writer.data.keys()), [ + 'evaluation/first_0/returns-test/mean', + 'evaluation/first_0/returns-test/std', + 'evaluation/second_0/returns-test/mean', + 'evaluation/second_0/returns-test/std' + ]) + steps = experiment._writer.data['evaluation/first_0/returns-test/mean']['steps'][0] + for datum in experiment._writer.data.values(): + self.assertEqual(len(datum['values']), 1) + self.assertGreaterEqual(datum['values'][0], 0.0) + self.assertEqual(len(datum['steps']), 1) + self.assertEqual(datum['steps'][0], steps) + + def test_writes_loss(self): + experiment = MockExperiment(self.make_preset(), self.env, quiet=True, write_loss=True, save_freq=float('inf')) + self.assertTrue(experiment._writer.write_loss) + experiment = MockExperiment(self.make_preset(), self.env, quiet=True, write_loss=False, save_freq=float('inf')) + self.assertFalse(experiment._writer.write_loss) + + def make_preset(self): + return IndependentMultiagentPreset('independent', 'cpu', { + agent: 
dqn.device('cpu').env(env).build() + for agent, env in self.env.subenvs.items() + }) + + +if __name__ == "__main__": + unittest.main() diff --git a/all/experiments/parallel_env_experiment.py b/all/experiments/parallel_env_experiment.py index 71b88a9f..eecc865d 100644 --- a/all/experiments/parallel_env_experiment.py +++ b/all/experiments/parallel_env_experiment.py @@ -3,25 +3,31 @@ import torch import numpy as np from all.core import State -from .writer import ExperimentWriter +from .writer import ExperimentWriter, CometWriter from .experiment import Experiment + class ParallelEnvExperiment(Experiment): '''An Experiment object for training and testing agents that use parallel training environments.''' + def __init__( self, - agent, + preset, env, + name=None, + train_steps=float('inf'), logdir='runs', quiet=False, render=False, - write_loss=True + write_loss=True, + writer="tensorboard" ): - super().__init__(self._make_writer(logdir, agent[0].__name__, env.name, write_loss), quiet) - make_agent, n_envs = agent - self._envs = env.duplicate(n_envs) - self._agent = make_agent(self._envs, self._writer) - self._n_envs = n_envs + self._name = name if name is not None else preset.name + super().__init__(self._make_writer(logdir, self._name, env.name, write_loss, writer), quiet) + self._n_envs = preset.n_envs + self._envs = env.duplicate(self._n_envs) + self._preset = preset + self._agent = preset.agent(writer=self._writer, train_steps=train_steps) self._render = render # training state @@ -55,11 +61,15 @@ def train(self, frames=np.inf, episodes=np.inf): self._step() def test(self, episodes=100): - self._test_reset(episodes) - while len(self._test_returns) < episodes: - self._test_step() - self._log_test(self._test_returns) - return self._test_returns + test_agent = self._preset.test_agent() + env = self._envs[0].duplicate(1)[0] + returns = [] + for episode in range(episodes): + episode_return = self._run_test_episode(test_agent, env) + returns.append(episode_return) + self._log_test_episode(episode, episode_return) + self._log_test(returns) + return returns def _reset(self): for env in self._envs: @@ -99,39 +109,6 @@ def _step_envs(self, actions): env.step(action) self._frame += 1 - def _test_reset(self, episodes): - self._reset() - self._test_episodes = episodes - self._test_episodes_started = 0 - self._test_returns = [] - self._should_save_returns = [True] * self._n_envs - - def _test_step(self): - states = self._aggregate_states() - actions = self._agent.eval(states) - self._test_step_envs(actions) - - def _test_step_envs(self, actions): - for i, env in enumerate(self._envs): - state = env.state - if self._render: - env.render() - if state.done: - self._returns[i] += state.reward - if self._should_save_returns[i]: - self._test_returns.append(self._returns[i].item()) - self._log_test_episode(len(self._test_returns), self._returns[i].item()) - if self._test_episodes_started > self._test_episodes: - self._should_save_returns[i] = False - env.reset() - self._returns[i] = 0 - self._test_episodes_started += 1 - else: - action = actions[i] - if action is not None: - self._returns[i] += state.reward - env.step(action) - def _aggregate_states(self): return State.array([env.state for env in self._envs]) @@ -142,9 +119,27 @@ def _aggregate_rewards(self): device=self._envs[0].device ) + def _run_test_episode(self, test_agent, env): + # initialize the episode + state = env.reset() + action = test_agent.act(state) + returns = 0 + + # loop until the episode is finished + while not state.done: + if 
self._render: + env.render() + state = env.step(action) + action = test_agent.act(state) + returns += state.reward + + return returns + def _fps(self, i): end_time = timer() return (self._frame - self._episode_start_frames[i]) / (end_time - self._episode_start_times[i]) - def _make_writer(self, logdir, agent_name, env_name, write_loss): + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + if writer == "comet": + return CometWriter(self, agent_name, env_name, loss=write_loss, logdir=logdir) return ExperimentWriter(self, agent_name, env_name, loss=write_loss, logdir=logdir) diff --git a/all/experiments/parallel_env_experiment_test.py b/all/experiments/parallel_env_experiment_test.py index 5ceecf12..9f2d9816 100644 --- a/all/experiments/parallel_env_experiment_test.py +++ b/all/experiments/parallel_env_experiment_test.py @@ -6,24 +6,30 @@ from all.experiments import ParallelEnvExperiment from all.experiments.single_env_experiment_test import MockWriter -# pylint: disable=protected-access + class MockExperiment(ParallelEnvExperiment): - def _make_writer(self, logdir, agent_name, env_name, write_loss): - self._writer = MockWriter(self, agent_name + '_' + env_name, write_loss) + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + self._writer = MockWriter(self, agent_name + '_' + env_name, write_loss) return self._writer -class TestParalleleEnvExperiment(unittest.TestCase): +class TestParallelEnvExperiment(unittest.TestCase): def setUp(self): np.random.seed(0) torch.manual_seed(0) self.env = GymEnvironment('CartPole-v0') - self.experiment = MockExperiment(a2c(), self.env, quiet=True) + self.env.seed(0) + self.experiment = MockExperiment(self.make_agent(), self.env, quiet=True) for i, env in enumerate(self.experiment._envs): env.seed(i) - def test_adds_label(self): - self.assertEqual(self.experiment._writer.label, "_a2c_CartPole-v0") + def test_adds_default_label(self): + self.assertEqual(self.experiment._writer.label, "a2c_CartPole-v0") + + def test_adds_custom_label(self): + env = GymEnvironment('CartPole-v0') + experiment = MockExperiment(self.make_agent(), env, name='a2c', quiet=True) + self.assertEqual(experiment._writer.label, "a2c_CartPole-v0") def test_writes_training_returns_eps(self): self.experiment.train(episodes=3) @@ -39,27 +45,25 @@ def test_writes_training_returns_eps(self): def test_writes_test_returns(self): self.experiment.train(episodes=5) returns = self.experiment.test(episodes=4) - expected_mean = 9.5 - expected_std = 0.8660254037844386 - np.testing.assert_equal(np.mean(returns), expected_mean) + self.assertEqual(len(returns), 4) np.testing.assert_equal( self.experiment._writer.data["evaluation/returns-test/mean"]["values"], - np.array([expected_mean]), + np.array([np.mean(returns)]), ) np.testing.assert_equal( self.experiment._writer.data["evaluation/returns-test/std"]["values"], - np.array([expected_std]), - ) - np.testing.assert_equal( - self.experiment._writer.data["evaluation/returns-test/mean"]["steps"], - np.array([104.]), + np.array([np.std(returns)]), ) def test_writes_loss(self): - experiment = MockExperiment(a2c(), self.env, quiet=True, write_loss=True) + experiment = MockExperiment(self.make_agent(), self.env, quiet=True, write_loss=True) self.assertTrue(experiment._writer.write_loss) - experiment = MockExperiment(a2c(), self.env, quiet=True, write_loss=False) + experiment = MockExperiment(self.make_agent(), self.env, quiet=True, write_loss=False) self.assertFalse(experiment._writer.write_loss) + def 
make_agent(self): + return a2c.device('cpu').env(self.env).build() + + if __name__ == "__main__": unittest.main() diff --git a/all/experiments/plots.py b/all/experiments/plots.py index 58803667..a5f258fd 100644 --- a/all/experiments/plots.py +++ b/all/experiments/plots.py @@ -20,12 +20,12 @@ def load_returns_100_data(runs_dir): data = {} def add_data(agent, env, file): - if not env in data: + if env not in data: data[env] = {} data[env][agent] = np.genfromtxt(file, delimiter=",").reshape((-1, 3)) for agent_dir in os.listdir(runs_dir): - agent = agent_dir.split(" ")[0].strip("_") + agent = agent_dir.split("_")[0] agent_path = os.path.join(runs_dir, agent_dir) if os.path.isdir(agent_path): for env in os.listdir(agent_path): diff --git a/all/experiments/run_experiment.py b/all/experiments/run_experiment.py index 7b4cea18..132974e1 100644 --- a/all/experiments/run_experiment.py +++ b/all/experiments/run_experiment.py @@ -1,5 +1,6 @@ from .single_env_experiment import SingleEnvExperiment from .parallel_env_experiment import ParallelEnvExperiment +from all.presets import ParallelPreset def run_experiment( @@ -10,7 +11,8 @@ def run_experiment( quiet=False, render=False, test_episodes=100, - write_loss=True + write_loss=True, + writer="tensorboard" ): if not isinstance(agents, list): agents = [agents] @@ -19,25 +21,26 @@ def run_experiment( envs = [envs] for env in envs: - for agent in agents: - make_experiment = get_experiment_type(agent) + for preset_builder in agents: + preset = preset_builder.env(env).build() + make_experiment = get_experiment_type(preset) experiment = make_experiment( - agent, + preset, env, + train_steps=frames, logdir=logdir, quiet=quiet, render=render, - write_loss=write_loss + write_loss=write_loss, + writer=writer ) experiment.train(frames=frames) + experiment.save() experiment.test(episodes=test_episodes) + experiment.close() -def get_experiment_type(agent): - if is_parallel_env_agent(agent): +def get_experiment_type(preset): + if isinstance(preset, ParallelPreset): return ParallelEnvExperiment return SingleEnvExperiment - - -def is_parallel_env_agent(agent): - return isinstance(agent, tuple) diff --git a/all/experiments/single_env_experiment.py b/all/experiments/single_env_experiment.py index 8630fd05..8e6c619b 100644 --- a/all/experiments/single_env_experiment.py +++ b/all/experiments/single_env_experiment.py @@ -1,21 +1,30 @@ from timeit import default_timer as timer import numpy as np -from .writer import ExperimentWriter +from .writer import ExperimentWriter, CometWriter + from .experiment import Experiment + class SingleEnvExperiment(Experiment): '''An Experiment object for training and testing agents that interact with one environment at a time.''' + def __init__( self, - agent, + preset, env, + name=None, + train_steps=float('inf'), logdir='runs', quiet=False, render=False, - write_loss=True + write_loss=True, + writer="tensorboard" ): - super().__init__(self._make_writer(logdir, agent.__name__, env.name, write_loss), quiet) - self._agent = agent(env, self._writer) + self._name = name if name is not None else preset.name + super().__init__(self._make_writer(logdir, self._name, env.name, write_loss, writer), quiet) + self._logdir = logdir + self._preset = preset + self._agent = self._preset.agent(writer=self._writer, train_steps=train_steps) self._env = env self._render = render self._frame = 1 @@ -37,9 +46,10 @@ def train(self, frames=np.inf, episodes=np.inf): self._run_training_episode() def test(self, episodes=100): + test_agent = self._preset.test_agent() 
returns = [] for episode in range(episodes): - episode_return = self._run_test_episode() + episode_return = self._run_test_episode(test_agent) returns.append(episode_return) self._log_test_episode(episode, episode_return) self._log_test(returns) @@ -74,10 +84,10 @@ def _run_training_episode(self): # update experiment state self._episode += 1 - def _run_test_episode(self): + def _run_test_episode(self, test_agent): # initialize the episode state = self._env.reset() - action = self._agent.eval(state) + action = test_agent.act(state) returns = 0 # loop until the episode is finished @@ -85,7 +95,7 @@ def _run_test_episode(self): if self._render: self._env.render() state = self._env.step(action) - action = self._agent.eval(state) + action = test_agent.act(state) returns += state.reward return returns @@ -93,5 +103,7 @@ def _run_test_episode(self): def _done(self, frames, episodes): return self._frame > frames or self._episode > episodes - def _make_writer(self, logdir, agent_name, env_name, write_loss): + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + if writer == "comet": + return CometWriter(self, agent_name, env_name, loss=write_loss, logdir=logdir) return ExperimentWriter(self, agent_name, env_name, loss=write_loss, logdir=logdir) diff --git a/all/experiments/single_env_experiment_test.py b/all/experiments/single_env_experiment_test.py index 05d48e4c..235847c3 100644 --- a/all/experiments/single_env_experiment_test.py +++ b/all/experiments/single_env_experiment_test.py @@ -7,7 +7,6 @@ from all.logging import Writer -# pylint: disable=protected-access class MockWriter(Writer): def __init__(self, experiment, label, write_loss): self.data = {} @@ -16,7 +15,7 @@ def __init__(self, experiment, label, write_loss): self.experiment = experiment def add_scalar(self, key, value, step="frame"): - if not key in self.data: + if key not in self.data: self.data[key] = {"values": [], "steps": []} self.data[key]["values"].append(value) self.data[key]["steps"].append(self._get_step(step)) @@ -41,10 +40,13 @@ def _get_step(self, _type): return self.experiment.episode return _type + def close(self): + pass + class MockExperiment(SingleEnvExperiment): - def _make_writer(self, logdir, agent_name, env_name, write_loss): - self._writer = MockWriter(self, agent_name + '_' + env_name, write_loss) + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + self._writer = MockWriter(self, agent_name + '_' + env_name, write_loss) return self._writer @@ -56,12 +58,16 @@ def setUp(self): self.env.seed(0) self.experiment = None - def test_adds_label(self): - experiment = MockExperiment(dqn(), self.env, quiet=True) - self.assertEqual(experiment._writer.label, "_dqn_CartPole-v0") + def test_adds_default_name(self): + experiment = MockExperiment(self.make_preset(), self.env, quiet=True) + self.assertEqual(experiment._writer.label, "dqn_CartPole-v0") + + def test_adds_custom_name(self): + experiment = MockExperiment(self.make_preset(), self.env, name='dqn', quiet=True) + self.assertEqual(experiment._writer.label, "dqn_CartPole-v0") def test_writes_training_returns_eps(self): - experiment = MockExperiment(dqn(), self.env, quiet=True) + experiment = MockExperiment(self.make_preset(), self.env, quiet=True) experiment.train(episodes=3) np.testing.assert_equal( experiment._writer.data["evaluation/returns/episode"]["values"], @@ -73,7 +79,7 @@ def test_writes_training_returns_eps(self): ) def test_writes_test_returns(self): - experiment = MockExperiment(dqn(), self.env, quiet=True) + 
experiment = MockExperiment(self.make_preset(), self.env, quiet=True) experiment.train(episodes=5) returns = experiment.test(episodes=4) expected_mean = 9.5 @@ -93,10 +99,14 @@ def test_writes_test_returns(self): ) def test_writes_loss(self): - experiment = MockExperiment(dqn(), self.env, quiet=True, write_loss=True) + experiment = MockExperiment(self.make_preset(), self.env, quiet=True, write_loss=True) self.assertTrue(experiment._writer.write_loss) - experiment = MockExperiment(dqn(), self.env, quiet=True, write_loss=False) + experiment = MockExperiment(self.make_preset(), self.env, quiet=True, write_loss=False) self.assertFalse(experiment._writer.write_loss) + def make_preset(self): + return dqn.device('cpu').env(self.env).build() + + if __name__ == "__main__": unittest.main() diff --git a/all/experiments/slurm.py b/all/experiments/slurm.py index ef2f95ff..625bd757 100644 --- a/all/experiments/slurm.py +++ b/all/experiments/slurm.py @@ -4,8 +4,6 @@ import sys from .run_experiment import run_experiment -SCRIPT_NAME = 'experiment.sh' -OUT_DIR = 'out' # track the number of experiments created # in the current process @@ -13,6 +11,7 @@ "value": 1 } + class SlurmExperiment: def __init__( self, @@ -20,7 +19,11 @@ def __init__( envs, frames, test_episodes=100, + write_loss=False, job_name='autonomous-learning-library', + script_name='experiment.sh', + outdir='out', + logdir='runs', sbatch_args=None, ): if not isinstance(agents, list): @@ -33,7 +36,11 @@ def __init__( self.envs = envs self.frames = frames self.test_episodes = test_episodes + self.write_loss = write_loss self.job_name = job_name + self.script_name = script_name + self.outdir = outdir + self.logdir = logdir self.sbatch_args = sbatch_args or {} self.parse_args() @@ -60,7 +67,14 @@ def run_experiment(self): task_id = int(os.environ['SLURM_ARRAY_TASK_ID']) env = self.envs[int(task_id / len(self.agents))] agent = self.agents[task_id % len(self.agents)] - run_experiment(agent, env, self.frames, test_episodes=self.test_episodes, write_loss=False) + run_experiment( + agent, + env, + self.frames, + test_episodes=self.test_episodes, + logdir=self.logdir, + write_loss=self.write_loss + ) def queue_jobs(self): self.create_sbatch_script() @@ -68,14 +82,14 @@ def queue_jobs(self): self.run_sbatch_script() def create_sbatch_script(self): - script = open(SCRIPT_NAME, 'w') + script = open(self.script_name, 'w') script.write('#!/bin/sh\n\n') num_experiments = len(self.envs) * len(self.agents) sbatch_args = { 'job-name': self.job_name, - 'output': 'out/all_%A_%a.out', - 'error': 'out/all_%A_%a.err', + 'output': os.path.join(self.outdir, 'all_%A_%a.out'), + 'error': os.path.join(self.outdir, 'all_%A_%a.err'), 'array': '0-' + str(num_experiments - 1), 'partition': '1080ti-short', 'ntasks': 1, @@ -90,18 +104,18 @@ def create_sbatch_script(self): script.write('python ' + sys.argv[0] + ' --experiment_id ' + str(self._id) + '\n') script.close() - print('created sbatch script:', SCRIPT_NAME) + print('created sbatch script:', self.script_name) def make_output_directory(self): try: - os.mkdir(OUT_DIR) - print('Created output directory:', OUT_DIR) + os.mkdir(self.outdir) + print('Created output directory:', self.outdir) except FileExistsError: - print('Output directory already exists:', OUT_DIR) + print('Output directory already exists:', self.outdir) def run_sbatch_script(self): result = subprocess.run( - ['sbatch', SCRIPT_NAME], + ['sbatch', self.script_name], stdout=subprocess.PIPE, check=True ) diff --git a/all/experiments/watch.py 
b/all/experiments/watch.py index 2c40f9b7..5c0b1d03 100644 --- a/all/experiments/watch.py +++ b/all/experiments/watch.py @@ -4,95 +4,28 @@ import gym from all.agents import Agent + def watch(agent, env, fps=60): action = None returns = 0 # have to call this before initial reset for pybullet envs env.render(mode="human") + env.reset() + while True: + action = agent.act(env.state) + returns += env.state.reward + time.sleep(1 / fps) - if env.done: + if env.state.done: print('returns:', returns) env.reset() returns = 0 else: env.step(action) env.render() - action = agent.act(env.state, env.reward) - returns += env.reward - -def load_and_watch(dir, env, fps=60): - watch(GreedyAgent.load(dir, env), env, fps=fps) - -class GreedyAgent(Agent): - def __init__( - self, - action_space, - feature=None, - q=None, - policy=None - ): - self.action_space = action_space - self.feature = feature - self.policy = None - if policy: - self.policy = policy - else: - self.policy = q - if not self.policy: - raise TypeError('GreedyAgent must have either policy or q function') - - def act(self, state, _): - with torch.no_grad(): - if self.feature: - state = self.feature(state) - if isinstance(self.action_space, gym.spaces.Discrete): - return self.choose_discrete(state) - if isinstance(self.action_space, gym.spaces.Box): - return self.choose_continuous(state) - raise TypeError('Unknown action space') - - def eval(self, state, reward): - return self.act(state, reward) - - def choose_discrete(self, state): - ret = self.policy(state) - if isinstance(ret, torch.Tensor): - if len(ret.shape) == 3: # categorical dqn - return torch.argmax((ret * self.policy.atoms).sum(dim=2), dim=1) - return torch.argmax(self.policy(state), dim=1) - if isinstance(ret, torch.distributions.distribution.Distribution): - return ret.sample() - return ret # unknown type, return it and pray! - - def choose_continuous(self, state): - ret = self.policy(state) - if isinstance(ret, torch.Tensor): - return ret - if isinstance(ret, tuple): - return ret[0] - if isinstance(ret, torch.distributions.distribution.Distribution): - return ret.sample() - return ret # unknown type, return it and pray! - - @staticmethod - def load(dirname, env): - feature = None - policy = None - q = None - for filename in os.listdir(dirname): - if filename == 'feature.pt': - feature = torch.load(os.path.join(dirname, filename)).to(env.device) - if filename == 'policy.pt': - policy = torch.load(os.path.join(dirname, filename)).to(env.device) - if filename in ('q.pt', 'q_dist.pt'): - q = torch.load(os.path.join(dirname, filename)).to(env.device) - agent = GreedyAgent( - env.action_space, - feature=feature, - policy=policy, - q=q, - ) - return agent +def load_and_watch(filename, env, fps=60): + agent = torch.load(filename).test_agent() + watch(agent, env, fps=fps) diff --git a/all/experiments/writer.py b/all/experiments/writer.py index 888f145a..137d5f64 100644 --- a/all/experiments/writer.py +++ b/all/experiments/writer.py @@ -9,7 +9,7 @@ class ExperimentWriter(SummaryWriter, Writer): ''' - The Writer object used by all.experiments.Experiment. + The default Writer object used by all.experiments.Experiment. Writes logs using tensorboard into the current logdir directory ('runs' by default), tagging the run with a combination of the agent name, the commit hash of the current git repo of the working directory (if any), and the current time. 
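# A hedged end-to-end sketch of the preset-based experiment flow introduced in this patch (not
# part of the patch itself). The preset import path is an assumption (presets are taken to live
# under all.presets.classic_control), and the run directory below is a hypothetical placeholder
# for the '<agent>_<commit>_<timestamp>' folder the writer creates; Experiment.save() stores the
# preset as '<log_dir>/preset.pt'.
from all.environments import GymEnvironment
from all.experiments import run_experiment, load_and_watch
from all.presets.classic_control import dqn  # assumed preset-builder location

env = GymEnvironment('CartPole-v0', device='cpu')

# builds the preset via dqn.env(env).build(), trains, saves, tests, and closes the writer
run_experiment(dqn, env, frames=10000, test_episodes=10, logdir='runs', writer='tensorboard')

# later: reload the serialized preset and watch its test agent play (loops forever, rendering)
load_and_watch('runs/dqn_<commit>_<timestamp>/preset.pt', env, fps=60)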
@@ -20,17 +20,13 @@ class ExperimentWriter(SummaryWriter, Writer): env_name (str): The name of the environment the Experiment is being performed in loss (bool, optional): Whether or not to log loss/scheduling metrics, or only evaluation and summary metrics. ''' + def __init__(self, experiment, agent_name, env_name, loss=True, logdir='runs'): self.env_name = env_name - current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %f') - os.makedirs( - os.path.join( - logdir, ("%s %s %s" % (agent_name, COMMIT_HASH, current_time)), env_name - ) - ) - self.log_dir = os.path.join( - logdir, ("%s %s %s" % (agent_name, COMMIT_HASH, current_time)) - ) + current_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S_%f') + dir_name = "%s_%s_%s" % (agent_name, COMMIT_HASH, current_time) + os.makedirs(os.path.join(logdir, dir_name, env_name)) + self.log_dir = os.path.join(logdir, dir_name) self._experiment = experiment self._loss = loss super().__init__(log_dir=self.log_dir) @@ -70,6 +66,74 @@ def _get_step(self, _type): return self._experiment.episode return _type + def close(self): + pass + + +class CometWriter(Writer): + ''' + A Writer object to be used by all.experiments.Experiment. + Writes logs using comet.ml Requires an API key to be stored in .comet.config or as an environment variable. + Look at https://www.comet.ml/docs/python-sdk/advanced/#python-configuration for more info. + Args: + experiment (all.experiments.Experiment): The Experiment associated with the Writer object. + agent_name (str): The name of the Agent the Experiment is being performed on + env_name (str): The name of the environment the Experiment is being performed in + loss (bool, optional): Whether or not to log loss/scheduling metrics, or only evaluation and summary metrics. + logdir (str): The directory where run information is stored. + ''' + + def __init__(self, experiment, agent_name, env_name, loss=True, logdir='runs'): + self.env_name = env_name + self._experiment = experiment + self._loss = loss + + try: + from comet_ml import Experiment + except ImportError as e: + print("Failed to import comet_ml. 
CometWriter requires that comet_ml be installed") + raise e + try: + self._comet = Experiment(project_name=env_name) + except ImportError as e: + print("See https://www.comet.ml/docs/python-sdk/warnings-errors/ for more info on this error.") + raise e + except ValueError as e: + print("See https://www.comet.ml/docs/python-sdk/advanced/#python-configuration for more info on this error.") + raise e + + self._comet.set_name(agent_name) + self.log_dir = logdir + + def add_loss(self, name, value, step="frame"): + if self._loss: + self.add_evaluation("loss/" + name, value, step) + + def add_evaluation(self, name, value, step="frame"): + self._comet.log_metric(name, value, self._get_step(step)) + + def add_schedule(self, name, value, step="frame"): + if self._loss: + self.add_scalar(name, value, step) + + def add_summary(self, name, mean, std, step="frame"): + self.add_evaluation(name + "/mean", mean, step) + self.add_evaluation(name + "/std", std, step) + + def add_scalar(self, name, value, step="frame"): + self._comet.log_metric(name, value, self._get_step(step)) + + def _get_step(self, _type): + if _type == "frame": + return self._experiment.frame + if _type == "episode": + return self._experiment.episode + return _type + + def close(self): + self._comet.end() + + def get_commit_hash(): result = subprocess.run( ["git", "rev-parse", "--short", "HEAD"], diff --git a/all/logging/__init__.py b/all/logging/__init__.py index f45fd280..f9387749 100644 --- a/all/logging/__init__.py +++ b/all/logging/__init__.py @@ -65,6 +65,12 @@ def add_summary(self, name, mean, std, step="frame"): step (str, optional): Which step to use (e.g., "frame" or "episode") ''' + @abstractmethod + def close(self): + ''' + Close the writer and perform any necessary cleanup. + ''' + class DummyWriter(Writer): '''A default Writer object that performs no logging and has no side effects.''' @@ -83,3 +89,6 @@ def add_schedule(self, name, value, step="frame"): def add_summary(self, name, mean, std, step="frame"): pass + + def close(self): + pass diff --git a/all/memory/advantage.py b/all/memory/advantage.py index 89cbb985..7ab42b20 100644 --- a/all/memory/advantage.py +++ b/all/memory/advantage.py @@ -1,6 +1,7 @@ import torch from all.core import State + class NStepAdvantageBuffer: def __init__(self, v, features, n_steps, n_envs, discount_factor=1): self.v = v diff --git a/all/memory/advantage_test.py b/all/memory/advantage_test.py index adf69e41..2223e4d4 100644 --- a/all/memory/advantage_test.py +++ b/all/memory/advantage_test.py @@ -41,7 +41,6 @@ def test_rollout(self): 1, 1, 1 ]) - self.assert_states_equal(states, expected_states) tt.assert_allclose(advantages, self._compute_expected_advantages( expected_states, expected_returns, expected_next_states, expected_lengths diff --git a/all/memory/generalized_advantage.py b/all/memory/generalized_advantage.py index ca588385..6ca6d447 100644 --- a/all/memory/generalized_advantage.py +++ b/all/memory/generalized_advantage.py @@ -1,7 +1,9 @@ import torch from all.core import State +from all.optim import Schedulable -class GeneralizedAdvantageBuffer: + +class GeneralizedAdvantageBuffer(Schedulable): def __init__( self, v, diff --git a/all/memory/replay_buffer.py b/all/memory/replay_buffer.py index 9da7c030..d60bad81 100644 --- a/all/memory/replay_buffer.py +++ b/all/memory/replay_buffer.py @@ -5,6 +5,7 @@ from all.optim import Schedulable from .segment_tree import SumSegmentTree, MinSegmentTree + class ReplayBuffer(ABC): @abstractmethod def store(self, state, action, reward, next_state): 
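# A minimal sketch of a custom Writer, modeled on the CometWriter and the MockWriter used in the
# tests in this patch (not part of the patch). The method set is assumed from those classes; a
# real subclass only needs whatever all.logging.Writer declares abstract, including the new
# close() hook added above.
from all.logging import Writer


class PrintWriter(Writer):
    '''Logs every metric to stdout; handy for quick debugging runs.'''

    def __init__(self, experiment, agent_name, env_name, loss=True, logdir='runs'):
        self._experiment = experiment
        self._loss = loss
        self.log_dir = logdir  # Experiment.save() reads writer.log_dir

    def add_scalar(self, name, value, step="frame"):
        print('{} (step={}): {}'.format(name, self._get_step(step), value))

    def add_loss(self, name, value, step="frame"):
        if self._loss:
            self.add_scalar('loss/' + name, value, step)

    def add_schedule(self, name, value, step="frame"):
        if self._loss:
            self.add_scalar('schedule/' + name, value, step)

    def add_evaluation(self, name, value, step="frame"):
        self.add_scalar('evaluation/' + name, value, step)

    def add_summary(self, name, mean, std, step="frame"):
        self.add_evaluation(name + '/mean', mean, step)
        self.add_evaluation(name + '/std', std, step)

    def _get_step(self, _type):
        if _type == "frame":
            return self._experiment.frame
        if _type == "episode":
            return self._experiment.episode
        return _type

    def close(self):
        pass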
@@ -22,14 +23,19 @@ def update_priorities(self, indexes, td_errors): # Adapted from: # https://github.com/Shmuma/ptan/blob/master/ptan/experience.py class ExperienceReplayBuffer(ReplayBuffer): - def __init__(self, size, device=torch.device('cpu')): + def __init__(self, size, device='cpu', store_device=None): self.buffer = [] self.capacity = int(size) self.pos = 0 - self.device = device + self.device = torch.device(device) + if store_device is None: + store_device = self.device + self.store_device = torch.device(store_device) def store(self, state, action, next_state): if state is not None and not state.done: + state = state.to(self.store_device) + next_state = next_state.to(self.store_device) self._add((state, action, next_state)) def sample(self, batch_size): @@ -48,12 +54,12 @@ def _add(self, sample): self.pos = (self.pos + 1) % self.capacity def _reshape(self, minibatch, weights): - states = State.array([sample[0] for sample in minibatch]) + states = State.array([sample[0] for sample in minibatch]).to(self.device) if torch.is_tensor(minibatch[0][1]): - actions = torch.stack([sample[1] for sample in minibatch]) + actions = torch.stack([sample[1] for sample in minibatch]).to(self.device) else: actions = torch.tensor([sample[1] for sample in minibatch], device=self.device) - next_states = State.array([sample[2] for sample in minibatch]) + next_states = State.array([sample[2] for sample in minibatch]).to(self.device) return (states, actions, next_states.reward, next_states, weights) def __len__(self): @@ -62,6 +68,7 @@ def __len__(self): def __iter__(self): return iter(self.buffer) + class PrioritizedReplayBuffer(ExperienceReplayBuffer, Schedulable): def __init__( self, @@ -69,9 +76,10 @@ def __init__( alpha=0.6, beta=0.4, epsilon=1e-5, - device=torch.device('cpu') + device=torch.device('cpu'), + store_device=None ): - super().__init__(buffer_size, device=device) + super().__init__(buffer_size, device=device, store_device=store_device) assert alpha >= 0 self._alpha = alpha @@ -140,8 +148,10 @@ def _sample_proportional(self, batch_size): res.append(idx) return res + class NStepReplayBuffer(ReplayBuffer): '''Converts any ReplayBuffer into an NStepReplayBuffer''' + def __init__( self, steps, @@ -177,7 +187,7 @@ def store(self, state, action, next_state): def _store_next(self, next_state): self.buffer.store(self._states[0], self._actions[0], next_state.update('reward', self._reward)) - self._reward = self._reward - self._rewards[0] + self._reward = self._reward - self._rewards[0] self._reward *= self.discount_factor ** -1 del self._states[0] del self._actions[0] diff --git a/all/memory/replay_buffer_test.py b/all/memory/replay_buffer_test.py index f0b92da3..ccaa6202 100644 --- a/all/memory/replay_buffer_test.py +++ b/all/memory/replay_buffer_test.py @@ -12,13 +12,12 @@ class TestExperienceReplayBuffer(unittest.TestCase): - def setUp(self): + def test_run(self): np.random.seed(1) random.seed(1) torch.manual_seed(1) self.replay_buffer = ExperienceReplayBuffer(5) - def test_run(self): states = torch.arange(0, 20) actions = torch.arange(0, 20).view((-1, 1)) rewards = torch.arange(0, 20) @@ -51,6 +50,20 @@ def test_run(self): ) np.testing.assert_array_equal(expected_weights, np.vstack(actual_weights)) + def test_store_device(self): + if torch.cuda.is_available(): + self.replay_buffer = ExperienceReplayBuffer(5, device='cuda', store_device='cpu') + + states = torch.arange(0, 20).to('cuda') + actions = torch.arange(0, 20).view((-1, 1)).to('cuda') + rewards = torch.arange(0, 20).to('cuda') + state = 
State(states[0]) + next_state = State(states[1], reward=rewards[1]) + self.replay_buffer.store(state, actions[0], next_state) + sample = self.replay_buffer.sample(3) + self.assertEqual(sample[0].device, torch.device('cuda')) + self.assertEqual(self.replay_buffer.buffer[0][0].device, torch.device('cpu')) + class TestPrioritizedReplayBuffer(unittest.TestCase): def setUp(self): diff --git a/all/memory/segment_tree.py b/all/memory/segment_tree.py index 45f087a6..a2313a5d 100644 --- a/all/memory/segment_tree.py +++ b/all/memory/segment_tree.py @@ -1,8 +1,8 @@ -# pylint: skip-file # This entire module was stolen from: # https://github.com/Shmuma/ptan/blob/master/ptan/common/utils.py import operator + class SegmentTree(object): def __init__(self, capacity, operation, neutral_element): """Build a Segment Tree data structure. @@ -15,7 +15,7 @@ def __init__(self, capacity, operation, neutral_element): operation which reduces `operation` over a contiguous subsequence of items in the array. - Paramters + Parameters --------- capacity: int Total size of the array - must be a power of two. @@ -85,8 +85,9 @@ def __getitem__(self, idx): assert 0 <= idx < self._capacity return self._value[self._capacity + idx] -# stolen from https://github.com/Shmuma/ptan/blob/master/ptan/common/utils.py + class SumSegmentTree(SegmentTree): + # stolen from https://github.com/Shmuma/ptan/blob/master/ptan/common/utils.py def __init__(self, capacity): super(SumSegmentTree, self).__init__( capacity=capacity, @@ -107,7 +108,7 @@ def find_prefixsum_idx(self, prefixsum): Parameters ---------- perfixsum: float - upperbound on the sum of array prefix + upper bound on the sum of array prefix Returns ------- idx: int @@ -123,8 +124,9 @@ def find_prefixsum_idx(self, prefixsum): idx = 2 * idx + 1 return idx - self._capacity -# stolen from https://github.com/Shmuma/ptan/blob/master/ptan/common/utils.py + class MinSegmentTree(SegmentTree): + # stolen from https://github.com/Shmuma/ptan/blob/master/ptan/common/utils.py def __init__(self, capacity): super(MinSegmentTree, self).__init__( capacity=capacity, diff --git a/all/nn/__init__.py b/all/nn/__init__.py index 7fe4faec..d8d0ecd7 100644 --- a/all/nn/__init__.py +++ b/all/nn/__init__.py @@ -1,15 +1,20 @@ import torch from torch import nn -from torch.nn import * # export everthing +from torch.nn import * # noqa from torch.nn import functional as F import numpy as np from all.core import State +""""A Pytorch Module""" +Module = nn.Module + + class RLNetwork(nn.Module): """ Wraps a network such that States can be given as input. """ + def __init__(self, model, _=None): super().__init__() self.model = model @@ -18,6 +23,7 @@ def __init__(self, model, _=None): def forward(self, state): return state.apply(self.model, 'observation') + class Aggregation(nn.Module): """ Aggregation layer for the Dueling architecture. @@ -25,8 +31,8 @@ class Aggregation(nn.Module): https://arxiv.org/abs/1511.06581 This layer computes a Q function by combining an estimate of V with an estimate of the advantage. - The advantage is normalized by substracting the average - advantage so that we can propertly + The advantage is normalized by subtracting the average + advantage so that we can properly """ def forward(self, value, advantages): @@ -74,7 +80,7 @@ def forward(self, features): ).view((batch_size, -1)) -class Flatten(nn.Module): # pylint: disable=function-redefined +class Flatten(nn.Module): """ Flatten a tensor, e.g., between conv2d and linear layers. 
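# Sketch of the new store_device option on ExperienceReplayBuffer (see the replay_buffer.py
# changes above), mirroring the test_store_device case: transitions are kept on store_device
# while sampled minibatches come back on the training device. Import paths assume the usual
# all.core / all.memory exports; this is illustrative, not code from the patch.
import torch
from all.core import State
from all.memory import ExperienceReplayBuffer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
buffer = ExperienceReplayBuffer(100, device=device, store_device='cpu')

observations = torch.randn(2, 4, device=device)
state = State(observations[0])
next_state = State(observations[1], reward=1.0)
buffer.store(state, torch.tensor(0, device=device), next_state)

# stored copies live on store_device, sampled batches are moved to the training device
states, actions, rewards, next_states, weights = buffer.sample(1)
print(buffer.buffer[0][0].device, states.device)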
@@ -125,6 +131,7 @@ def forward(self, x): bias = bias + self.sigma_bias * self.epsilon_bias return F.linear(x, self.weight + self.sigma_weight * self.epsilon_weight, bias) + class NoisyFactorizedLinear(nn.Linear): """ NoisyNet layer with factorized gaussian noise @@ -168,6 +175,7 @@ def forward(self, input): noise_v = torch.mul(eps_in, eps_out) return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) + class Linear0(nn.Linear): def reset_parameters(self): nn.init.constant_(self.weight, 0.0) @@ -197,16 +205,19 @@ def __init__(self, action_space): def forward(self, x): return torch.tanh(x) * self.weight + self.bias + def td_loss(loss): def _loss(estimates, errors): return loss(estimates, errors + estimates.detach()) return _loss + def weighted_mse_loss(input, target, weight, reduction='mean'): loss = (weight * ((target - input) ** 2)) return torch.mean(loss) if reduction == 'mean' else torch.sum(loss) + def weighted_smooth_l1_loss(input, target, weight, reduction='mean'): t = torch.abs(input - target) loss = torch.where(t < 1, 0.5 * t ** 2, t - 0.5) diff --git a/all/optim/scheduler.py b/all/optim/scheduler.py index 0167a6eb..a685a8d4 100644 --- a/all/optim/scheduler.py +++ b/all/optim/scheduler.py @@ -1,16 +1,20 @@ from all.logging import DummyWriter + class Schedulable: '''Allow "instance" descriptors to implement parameter scheduling.''' + def __getattribute__(self, name): value = object.__getattribute__(self, name) if isinstance(value, Scheduler): value = value.__get__(self, self.__class__) return value + class Scheduler: pass + class LinearScheduler(Scheduler): def __init__( self, diff --git a/all/optim/scheduler_test.py b/all/optim/scheduler_test.py index 844d30ee..3a66d237 100644 --- a/all/optim/scheduler_test.py +++ b/all/optim/scheduler_test.py @@ -2,10 +2,12 @@ import numpy as np from all.optim import Schedulable, LinearScheduler + class Obj(Schedulable): def __init__(self): self.attr = 0 + class TestScheduler(unittest.TestCase): def test_linear_scheduler(self): obj = Obj() @@ -14,5 +16,6 @@ def test_linear_scheduler(self): actual = [obj.attr for _ in expected] np.testing.assert_allclose(actual, expected) + if __name__ == '__main__': unittest.main() diff --git a/all/policies/deterministic.py b/all/policies/deterministic.py index af3aa7d8..a5b8991f 100644 --- a/all/policies/deterministic.py +++ b/all/policies/deterministic.py @@ -16,11 +16,12 @@ class DeterministicPolicy(Approximation): action_space (gym.spaces.Box): The Box representing the action space. kwargs (optional): Any other arguments accepted by all.approximation.Approximation ''' + def __init__( self, model, - optimizer, - space, + optimizer=None, + space=None, name='policy', **kwargs ): diff --git a/all/policies/deterministic_test.py b/all/policies/deterministic_test.py index 10c1a0a0..f018a0d1 100644 --- a/all/policies/deterministic_test.py +++ b/all/policies/deterministic_test.py @@ -11,6 +11,7 @@ STATE_DIM = 2 ACTION_DIM = 3 + class TestDeterministic(unittest.TestCase): def setUp(self): torch.manual_seed(2) @@ -79,5 +80,6 @@ def test_target(self): atol=1e-4, ) + if __name__ == '__main__': unittest.main() diff --git a/all/policies/gaussian.py b/all/policies/gaussian.py index d24acfdc..977a73b1 100644 --- a/all/policies/gaussian.py +++ b/all/policies/gaussian.py @@ -10,25 +10,27 @@ class GaussianPolicy(Approximation): A Gaussian stochastic policy. This policy will choose actions from a distribution represented by a spherical Gaussian. 
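# Sketch of the Schedulable / LinearScheduler pattern from all/optim/scheduler.py above:
# assigning a LinearScheduler to an attribute of a Schedulable makes each read of that
# attribute return the next value of the schedule. The positional argument order
# (initial value, final value, decay start, decay end) is inferred from the preset code
# later in this patch; the exact decayed values are illustrative only.
from all.optim import Schedulable, LinearScheduler

class Exploration(Schedulable):
    def __init__(self):
        # read like a normal float, but each access advances the schedule
        self.epsilon = LinearScheduler(1.0, 0.1, 0, 5, name='epsilon')

exploration = Exploration()
print([round(exploration.epsilon, 2) for _ in range(8)])  # decays from 1.0 toward 0.1, then stays there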
- The first n outputs the model will be squashed to [-1, 1] through a tanh function, and then - scaled to the given action_space, and the remaining n outputs will define the amount of noise added. + The first n outputs of the model are the mean of the distribution and the last n outputs are the log variance. + The output will be centered and scaled to the size of the given space, but the output will not be clipped. + For example, for an output range of [-1, 1], the center is 0 and the scale is 1. Args: model (torch.nn.Module): A Pytorch module representing the policy network. The input shape should be the same as the shape of the state (or feature) space, and the output shape should be double the size of the the action space. The first n outputs will be the unscaled mean of the action for each dimension, - and the second n outputs will be the logarithm of the variance. + and the last n outputs will be the logarithm of the variance. optimizer (torch.optim.Optimizer): A optimizer initialized with the model parameters, e.g. SGD, Adam, RMSprop, etc. action_space (gym.spaces.Box): The Box representing the action space. kwargs (optional): Any other arguments accepted by all.approximation.Approximation ''' + def __init__( self, model, - optimizer, - space, + optimizer=None, + space=None, name='policy', **kwargs ): @@ -39,6 +41,7 @@ def __init__( **kwargs ) + class GaussianPolicyNetwork(RLNetwork): def __init__(self, model, space): super().__init__(model) @@ -47,18 +50,11 @@ def __init__(self, model, space): def forward(self, state): outputs = super().forward(state) - action_dim = outputs.shape[1] // 2 - means = self._squash(outputs[:, 0:action_dim]) - - if not self.training: - return means - - logvars = outputs[:, action_dim:] * self._scale - std = logvars.exp_() - return Independent(Normal(means, std), 1) - - def _squash(self, x): - return torch.tanh(x) * self._scale + self._center + action_dim = outputs.shape[-1] // 2 + means = outputs[..., 0:action_dim] + logvars = outputs[..., action_dim:] + std = (0.5 * logvars).exp_() + return Independent(Normal(means + self._center, std * self._scale), 1) def to(self, device): self._center = self._center.to(device) diff --git a/all/policies/gaussian_test.py b/all/policies/gaussian_test.py index d33a9c2d..d514c01c 100644 --- a/all/policies/gaussian_test.py +++ b/all/policies/gaussian_test.py @@ -11,6 +11,7 @@ STATE_DIM = 2 ACTION_DIM = 3 + class TestGaussian(unittest.TestCase): def setUp(self): torch.manual_seed(2) @@ -59,10 +60,10 @@ def test_converge(self): def test_eval(self): state = State(torch.randn(1, STATE_DIM)) dist = self.policy.no_grad(state) - tt.assert_almost_equal(dist.mean, torch.tensor([[-0.233, 0.459, -0.058]]), decimal=3) - tt.assert_almost_equal(dist.entropy(), torch.tensor([4.251]), decimal=3) - best = self.policy.eval(state) - tt.assert_almost_equal(best, torch.tensor([[-0.233, 0.459, -0.058]]), decimal=3) + tt.assert_almost_equal(dist.mean, torch.tensor([[-0.237, 0.497, -0.058]]), decimal=3) + tt.assert_almost_equal(dist.entropy(), torch.tensor([4.254]), decimal=3) + best = self.policy.eval(state).sample() + tt.assert_almost_equal(best, torch.tensor([[-0.888, -0.887, 0.404]]), decimal=3) if __name__ == '__main__': diff --git a/all/policies/greedy.py b/all/policies/greedy.py index 7c2b1868..47a7b7f7 100644 --- a/all/policies/greedy.py +++ b/all/policies/greedy.py @@ -2,12 +2,13 @@ import torch from all.optim import Schedulable + class GreedyPolicy(Schedulable): ''' An "epsilon-greedy" action selection policy for discrete action spaces. 
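# Standalone illustration of the distribution now built by GaussianPolicyNetwork.forward
# above: the first n model outputs are the mean, the last n the log variance, shifted and
# scaled by the center/scale derived from the action space. Plain torch, with a
# hypothetical 3-dimensional Box(-1, 1) action space.
import torch
from torch.distributions import Independent, Normal

torch.manual_seed(0)
outputs = torch.randn(2, 6)                 # batch of 2, double the action dimension (3)
action_dim = outputs.shape[-1] // 2
means = outputs[..., 0:action_dim]
logvars = outputs[..., action_dim:]
std = (0.5 * logvars).exp()                 # log variance -> standard deviation
center = torch.zeros(action_dim)            # Box(-1, 1) => center 0, scale 1
scale = torch.ones(action_dim)
dist = Independent(Normal(means + center, std * scale), 1)
print(dist.sample().shape, dist.log_prob(dist.mean).shape)   # [2, 3] and [2]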
This policy will usually choose the optimal action according to an approximation - of the action value function (the "q-function"), but with probabilty epsilon will + of the action value function (the "q-function"), but with probability epsilon will choose a random action instead. GreedyPolicy is a Schedulable, meaning that epsilon can be varied over time by passing a Scheduler object. @@ -16,6 +17,7 @@ class GreedyPolicy(Schedulable): num_actions (int): The number of available actions. epsilon (float, optional): The probability of selecting a random action. ''' + def __init__( self, q, @@ -39,12 +41,13 @@ def no_grad(self, state): def eval(self, state): return torch.argmax(self.q.eval(state)).item() + class ParallelGreedyPolicy(Schedulable): ''' A parallel version of the "epsilon-greedy" action selection policy for discrete action spaces. This policy will usually choose the optimal action according to an approximation - of the action value function (the "q-function"), but with probabilty epsilon will + of the action value function (the "q-function"), but with probability epsilon will choose a random action instead. GreedyPolicy is a Schedulable, meaning that epsilon can be varied over time by passing a Scheduler object. @@ -53,6 +56,7 @@ class ParallelGreedyPolicy(Schedulable): num_actions (int): The number of available actions. epsilon (float, optional): The probability of selecting a random action. ''' + def __init__( self, q, @@ -66,13 +70,13 @@ def __init__( def __call__(self, state): best_actions = torch.argmax(self.q(state), dim=-1) random_actions = torch.randint(0, self.n_actions, best_actions.shape, device=best_actions.device) - choices = (torch.randn_like(best_actions) < self.epsilon).int() + choices = (torch.rand(best_actions.shape, device=best_actions.device) < self.epsilon).int() return choices * random_actions + (1 - choices) * best_actions def no_grad(self, state): best_actions = torch.argmax(self.q.no_grad(state), dim=-1) random_actions = torch.randint(0, self.num_actions, best_actions.shape, device=best_actions.device) - choices = (torch.randn(best_actions.shape, device=best_actions.device) < self.epsilon).int() + choices = (torch.rand(best_actions.shape, device=best_actions.device) < self.epsilon).int() return choices * random_actions + (1 - choices) * best_actions def eval(self, state): diff --git a/all/policies/soft_deterministic.py b/all/policies/soft_deterministic.py index 0a20da4c..150618e2 100644 --- a/all/policies/soft_deterministic.py +++ b/all/policies/soft_deterministic.py @@ -2,6 +2,7 @@ from all.approximation import Approximation from all.nn import RLNetwork + class SoftDeterministicPolicy(Approximation): ''' A "soft" deterministic policy compatible with soft actor-critic (SAC). @@ -17,11 +18,12 @@ class SoftDeterministicPolicy(Approximation): action_space (gym.spaces.Box): The Box representing the action space. 
kwargs (optional): Any other arguments accepted by all.approximation.Approximation ''' + def __init__( self, model, - optimizer, - space, + optimizer=None, + space=None, name="policy", **kwargs ): diff --git a/all/policies/soft_deterministic_test.py b/all/policies/soft_deterministic_test.py index 27ee7cff..135bd692 100644 --- a/all/policies/soft_deterministic_test.py +++ b/all/policies/soft_deterministic_test.py @@ -11,6 +11,7 @@ STATE_DIM = 2 ACTION_DIM = 3 + class TestSoftDeterministic(unittest.TestCase): def setUp(self): torch.manual_seed(2) @@ -66,5 +67,6 @@ def test_scaling(self): tt.assert_allclose(action, torch.tensor([[-3.09055, -4.752777, 188.98222]])) tt.assert_allclose(log_prob, torch.tensor([-0.397002]), rtol=1e-4) + if __name__ == '__main__': unittest.main() diff --git a/all/policies/softmax.py b/all/policies/softmax.py index 1502d8e2..431333fe 100644 --- a/all/policies/softmax.py +++ b/all/policies/softmax.py @@ -16,10 +16,11 @@ class SoftmaxPolicy(Approximation): model parameters, e.g. SGD, Adam, RMSprop, etc. kwargs (optional): Any other arguments accepted by all.approximation.Approximation ''' + def __init__( self, model, - optimizer, + optimizer=None, name='policy', **kwargs ): @@ -34,6 +35,4 @@ def __init__(self, model): def forward(self, state): outputs = super().forward(state) probs = functional.softmax(outputs, dim=-1) - if self.training: - return torch.distributions.Categorical(probs) - return torch.argmax(probs, dim=-1) + return torch.distributions.Categorical(probs) diff --git a/all/policies/softmax_test.py b/all/policies/softmax_test.py index 0f3c28c3..6584b44c 100644 --- a/all/policies/softmax_test.py +++ b/all/policies/softmax_test.py @@ -8,6 +8,7 @@ STATE_DIM = 2 ACTIONS = 3 + class TestSoftmax(unittest.TestCase): def setUp(self): torch.manual_seed(2) @@ -78,7 +79,7 @@ def test_eval(self): [0.266, 0.196, 0.538], [0.469, 0.227, 0.304] ]), decimal=3) - best = self.policy.eval(states) + best = self.policy.eval(states).sample() tt.assert_equal(best, torch.tensor([2, 2, 0])) diff --git a/all/presets/.DS_Store b/all/presets/.DS_Store new file mode 100644 index 00000000..1020c62e Binary files /dev/null and b/all/presets/.DS_Store differ diff --git a/all/presets/__init__.py b/all/presets/__init__.py index aa9c5b84..82d875eb 100644 --- a/all/presets/__init__.py +++ b/all/presets/__init__.py @@ -1,5 +1,17 @@ -import all.presets.atari -import all.presets.classic_control -import all.presets.continuous +from all.presets import atari +from all.presets import classic_control +from all.presets import continuous +from .preset import Preset, ParallelPreset +from .builder import PresetBuilder, ParallelPresetBuilder +from .independent_multiagent import IndependentMultiagentPreset -__all__ = ["atari", "classic_control", "continuous"] +__all__ = [ + "Preset", + "ParallelPreset", + "PresetBuilder", + "ParallelPresetBuilder", + "atari", + "classic_control", + "continuous", + "IndependentMultiagentPreset" +] diff --git a/all/presets/atari/.DS_Store b/all/presets/atari/.DS_Store new file mode 100644 index 00000000..a0783859 Binary files /dev/null and b/all/presets/atari/.DS_Store differ diff --git a/all/presets/atari/__init__.py b/all/presets/atari/__init__.py index a14efce7..836b5657 100644 --- a/all/presets/atari/__init__.py +++ b/all/presets/atari/__init__.py @@ -1,13 +1,14 @@ -from .a2c import a2c -from .c51 import c51 -from .dqn import dqn -from .ddqn import ddqn -from .ppo import ppo -from .rainbow import rainbow -from .vac import vac -from .vpg import vpg -from .vqn import vqn 
-from .vsarsa import vsarsa +from .a2c import a2c, A2CAtariPreset +from .c51 import c51, C51AtariPreset +from .dqn import dqn, DQNAtariPreset +from .ddqn import ddqn, DDQNAtariPreset +from .ppo import ppo, PPOAtariPreset +from .rainbow import rainbow, RainbowAtariPreset +from .vac import vac, VACAtariPreset +from .vpg import vpg, VPGAtariPreset +from .vqn import vqn, VQNAtariPreset +from .vsarsa import vsarsa, VSarsaAtariPreset + __all__ = [ "a2c", diff --git a/all/presets/atari/a2c.py b/all/presets/atari/a2c.py index cdd71451..2b751d4c 100644 --- a/all/presets/atari/a2c.py +++ b/all/presets/atari/a2c.py @@ -1,40 +1,48 @@ +import copy +import math from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR -from all.agents import A2C +from all.agents import A2C, A2CTestAgent from all.bodies import DeepmindAtariBody from all.approximation import VNetwork, FeatureNetwork from all.logging import DummyWriter from all.policies import SoftmaxPolicy -from .models import nature_features, nature_value_head, nature_policy_head - - -def a2c( - # Common settings - device="cuda", - discount_factor=0.99, - last_frame=40e6, - # Adam optimizer settings - lr=7e-4, - eps=1.5e-4, - # Other optimization settings - clip_grad=0.1, - entropy_loss_scaling=0.01, - value_loss_scaling=0.5, - # Batch settings - n_envs=16, - n_steps=5, - # Model construction - feature_model_constructor=nature_features, - value_model_constructor=nature_value_head, - policy_model_constructor=nature_policy_head -): +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.atari.models import nature_features, nature_value_head, nature_policy_head + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 7e-4, + "eps": 1e-3, + # Other optimization settings + "clip_grad": 0.5, + "entropy_loss_scaling": 0.01, + "value_loss_scaling": 0.5, + # Batch settings + "n_envs": 16, + "n_steps": 5, + # Model construction + "feature_model_constructor": nature_features, + "value_model_constructor": nature_value_head, + "policy_model_constructor": nature_policy_head +} + + +class A2CAtariPreset(ParallelPreset): """ - A2C Atari preset. + Advantage Actor-Critic (A2C) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. clip_grad (float): The maximum magnitude of the gradient for any given parameter. @@ -47,47 +55,42 @@ def a2c( value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. 
""" - def _a2c(envs, writer=DummyWriter()): - env = envs[0] - final_anneal_step = last_frame / (n_steps * n_envs * 4) - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(env).to(device) - feature_model = feature_model_constructor().to(device) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor']().to(device) + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs']) - feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps) - value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) - policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) features = FeatureNetwork( - feature_model, + self.feature_model, feature_optimizer, - scheduler=CosineAnnealingLR( - feature_optimizer, - final_anneal_step, - ), - clip_grad=clip_grad, + scheduler=CosineAnnealingLR(feature_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + v = VNetwork( - value_model, + self.value_model, value_optimizer, - scheduler=CosineAnnealingLR( - value_optimizer, - final_anneal_step, - ), - loss_scaling=value_loss_scaling, - clip_grad=clip_grad, + scheduler=CosineAnnealingLR(value_optimizer, n_updates), + loss_scaling=self.hyperparameters["value_loss_scaling"], + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + policy = SoftmaxPolicy( - policy_model, + self.policy_model, policy_optimizer, - scheduler=CosineAnnealingLR( - policy_optimizer, - final_anneal_step, - ), - clip_grad=clip_grad, + scheduler=CosineAnnealingLR(policy_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) @@ -96,15 +99,18 @@ def _a2c(envs, writer=DummyWriter()): features, v, policy, - n_envs=n_envs, - n_steps=n_steps, - discount_factor=discount_factor, - entropy_loss_scaling=entropy_loss_scaling, + n_envs=self.hyperparameters["n_envs"], + n_steps=self.hyperparameters["n_steps"], + discount_factor=self.hyperparameters["discount_factor"], + entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"], writer=writer ), ) - return _a2c, n_envs + def test_agent(self): + features = FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return DeepmindAtariBody(A2CTestAgent(features, policy)) -__all__ = ["a2c"] +a2c = ParallelPresetBuilder('a2c', default_hyperparameters, A2CAtariPreset) diff --git a/all/presets/atari/c51.py b/all/presets/atari/c51.py index d09884b8..c21c3b0c 100644 --- a/all/presets/atari/c51.py +++ b/all/presets/atari/c51.py @@ -1,46 +1,54 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR from all.approximation import QDist, FixedTarget -from all.agents import C51 +from all.agents import C51, C51TestAgent from all.bodies import DeepmindAtariBody 
from all.logging import DummyWriter from all.memory import ExperienceReplayBuffer from all.optim import LinearScheduler -from .models import nature_c51 +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.atari.models import nature_c51 -def c51( - # Common settings - device="cuda", - discount_factor=0.99, - last_frame=40e6, - # Adam optimizer settings - lr=1e-4, - eps=1.5e-4, - # Training settings - minibatch_size=32, - update_frequency=4, - target_update_frequency=1000, - # Replay buffer settings - replay_start_size=80000, - replay_buffer_size=1000000, - # Explicit exploration - initial_exploration=0.02, - final_exploration=0., - # Distributional RL - atoms=51, - v_min=-10, - v_max=10, - # Model construction - model_constructor=nature_c51 -): +default_hyperparameters = { + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-4, + "eps": 1.5e-4, + # Training settings + "minibatch_size": 32, + "update_frequency": 4, + "target_update_frequency": 1000, + # Replay buffer settings + "replay_start_size": 80000, + "replay_buffer_size": 1000000, + # Explicit exploration + "initial_exploration": 0.02, + "final_exploration": 0., + "final_exploration_step": 250000, + "test_exploration": 0.001, + # Distributional RL + "atoms": 51, + "v_min": -10, + "v_max": 10, + # Model construction + "model_constructor": nature_c51 +} + + +class C51AtariPreset(Preset): """ - C51 Atari preset. + Categorical DQN (C51) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. @@ -48,59 +56,81 @@ def c51( target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. - initial_exploration (int): Initial probability of choosing a random action, + initial_exploration (float): Initial probability of choosing a random action, decayed over course of training. - final_exploration (int): Final probability of choosing a random action. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent atoms (int): The number of atoms in the categorical distribution used to represent the distributional value function. v_min (int): The expected return corresponding to the smallest atom. - v_max (int): The expected return correspodning to the larget atom. + v_max (int): The expected return corresponding to the largest atom. model_constructor (function): The function used to construct the neural model. 
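# Quick illustration (standard distributional RL, not code from this patch) of what the
# atoms / v_min / v_max hyperparameters above define: the support of the categorical return
# distribution used by C51, i.e. `atoms` evenly spaced return values on [v_min, v_max].
import torch

atoms, v_min, v_max = 51, -10, 10
support = torch.linspace(v_min, v_max, atoms)
print(support[:3], support[-3:])   # tensor([-10.0000, -9.6000, -9.2000]) ... tensor([9.2000, 9.6000, 10.0000])
# the expected return of a predicted distribution `probs` over this support is (probs * support).sum(-1)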
""" - def _c51(env, writer=DummyWriter()): - action_repeat = 4 - last_timestep = last_frame / action_repeat - last_update = (last_timestep - replay_start_size) / update_frequency - model = model_constructor(env, atoms=atoms).to(device) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env, atoms=hyperparameters['atoms']).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency'] + optimizer = Adam( - model.parameters(), - lr=lr, - eps=eps + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] ) + q = QDist( - model, + self.model, optimizer, - env.action_space.n, - atoms, - v_min=v_min, - v_max=v_max, - target=FixedTarget(target_update_frequency), - scheduler=CosineAnnealingLR(optimizer, last_update), + self.n_actions, + self.hyperparameters['atoms'], + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], + target=FixedTarget(self.hyperparameters['target_update_frequency']), + scheduler=CosineAnnealingLR(optimizer, n_updates), writer=writer, ) + replay_buffer = ExperienceReplayBuffer( - replay_buffer_size, - device=device + self.hyperparameters['replay_buffer_size'], + device=self.device ) + return DeepmindAtariBody( C51( q, replay_buffer, exploration=LinearScheduler( - initial_exploration, - final_exploration, + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], 0, - last_timestep, + self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"], name="epsilon", writer=writer, ), - discount_factor=discount_factor, - minibatch_size=minibatch_size, - replay_start_size=replay_start_size, - update_frequency=update_frequency, + discount_factor=self.hyperparameters["discount_factor"], + minibatch_size=self.hyperparameters["minibatch_size"], + replay_start_size=self.hyperparameters["replay_start_size"], + update_frequency=self.hyperparameters["update_frequency"], writer=writer ), - lazy_frames=True + lazy_frames=True, + episodic_lives=True + ) + + def test_agent(self): + q_dist = QDist( + copy.deepcopy(self.model), + None, + self.n_actions, + self.hyperparameters['atoms'], + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], ) - return _c51 + return DeepmindAtariBody(C51TestAgent(q_dist, self.n_actions, self.hyperparameters["test_exploration"])) + + +c51 = PresetBuilder('c51', default_hyperparameters, C51AtariPreset) diff --git a/all/presets/atari/ddqn.py b/all/presets/atari/ddqn.py index fb23027a..ceb44bcd 100644 --- a/all/presets/atari/ddqn.py +++ b/all/presets/atari/ddqn.py @@ -1,47 +1,55 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR from all.approximation import QNetwork, FixedTarget -from all.agents import DDQN +from all.agents import DDQN, DDQNTestAgent from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.memory import PrioritizedReplayBuffer from all.nn import weighted_smooth_l1_loss from all.optim import LinearScheduler from all.policies import GreedyPolicy -from .models import nature_ddqn - -def ddqn( - # Common settings - device="cuda", - discount_factor=0.99, - last_frame=40e6, - # Adam optimizer settings - lr=1e-4, - eps=1.5e-4, - # Training settings - 
minibatch_size=32, - update_frequency=4, - target_update_frequency=1000, - # Replay buffer settings - replay_start_size=80000, - replay_buffer_size=1000000, - # Explicit exploration - initial_exploration=1., - final_exploration=0.01, - final_exploration_frame=4000000, - # Prioritized replay settings - alpha=0.5, - beta=0.5, - # Model construction - model_constructor=nature_ddqn -): +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.atari.models import nature_ddqn + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-4, + "eps": 1.5e-4, + # Training settings + "minibatch_size": 32, + "update_frequency": 4, + "target_update_frequency": 1000, + # Replay buffer settings + "replay_start_size": 80000, + "replay_buffer_size": 1000000, + "alpha": 0.5, + "beta": 0.5, + # Explicit exploration + "initial_exploration": 1., + "final_exploration": 0.01, + "final_exploration_step": 250000, + "test_exploration": 0.001, + # Model construction + "model_constructor": nature_ddqn +} + + +class DDQNAtariPreset(Preset): """ - Dueling Double DQN with Prioritized Experience Replay (PER). + Dueling Double DQN (DDQN) with Prioritized Experience Replay (PER) Atari Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. @@ -49,61 +57,76 @@ def ddqn( target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. - initial_exploration (int): Initial probability of choosing a random action, - decayed until final_exploration_frame. - final_exploration (int): Final probability of choosing a random action. - final_exploration_frame (int): The frame where the exploration decay stops. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent alpha (float): Amount of prioritization in the prioritized experience replay buffer. (0 = no prioritization, 1 = full prioritization) beta (float): The strength of the importance sampling correction for prioritized experience replay. (0 = no correction, 1 = full correction) model_constructor (function): The function used to construct the neural model. 
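# Worked example (standard proportional prioritized replay, not code from this repository)
# of what the alpha and beta hyperparameters above control: alpha sharpens or flattens the
# sampling distribution over priorities, beta scales the importance-sampling correction.
import torch

priorities = torch.tensor([2.0, 1.0, 0.5, 0.1])   # e.g. recent absolute TD errors
alpha, beta = 0.5, 0.5

probs = priorities ** alpha
probs = probs / probs.sum()                        # P(i) = p_i^alpha / sum_j p_j^alpha
weights = (len(priorities) * probs) ** (-beta)     # w_i = (N * P(i))^-beta
weights = weights / weights.max()                  # normalized so the largest weight is 1
print(probs, weights)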
""" - def _ddqn(env, writer=DummyWriter()): - action_repeat = 4 - last_timestep = last_frame / action_repeat - last_update = (last_timestep - replay_start_size) / update_frequency - final_exploration_step = final_exploration_frame / action_repeat - model = model_constructor(env).to(device) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency'] + optimizer = Adam( - model.parameters(), - lr=lr, - eps=eps + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] ) + q = QNetwork( - model, + self.model, optimizer, - scheduler=CosineAnnealingLR(optimizer, last_update), - target=FixedTarget(target_update_frequency), + scheduler=CosineAnnealingLR(optimizer, n_updates), + target=FixedTarget(self.hyperparameters['target_update_frequency']), writer=writer ) + policy = GreedyPolicy( q, - env.action_space.n, + self.n_actions, epsilon=LinearScheduler( - initial_exploration, - final_exploration, - replay_start_size, - final_exploration_step - replay_start_size, - name="epsilon", + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + self.hyperparameters['replay_start_size'], + self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'], + name="exploration", writer=writer ) ) + replay_buffer = PrioritizedReplayBuffer( - replay_buffer_size, - alpha=alpha, - beta=beta, - device=device + self.hyperparameters['replay_buffer_size'], + alpha=self.hyperparameters['alpha'], + beta=self.hyperparameters['beta'], + device=self.device ) + return DeepmindAtariBody( DDQN(q, policy, replay_buffer, loss=weighted_smooth_l1_loss, - discount_factor=discount_factor, - minibatch_size=minibatch_size, - replay_start_size=replay_start_size, - update_frequency=update_frequency, - ), + discount_factor=self.hyperparameters["discount_factor"], + minibatch_size=self.hyperparameters["minibatch_size"], + replay_start_size=self.hyperparameters["replay_start_size"], + update_frequency=self.hyperparameters["update_frequency"], + ), lazy_frames=True ) - return _ddqn + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return DeepmindAtariBody( + DDQNTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) + ) + + +ddqn = PresetBuilder('ddqn', default_hyperparameters, DDQNAtariPreset) diff --git a/all/presets/atari/dqn.py b/all/presets/atari/dqn.py index 53b1209a..fef4382f 100644 --- a/all/presets/atari/dqn.py +++ b/all/presets/atari/dqn.py @@ -1,45 +1,56 @@ +import copy +import torch +import numpy as np from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR from torch.nn.functional import smooth_l1_loss +from all import nn from all.approximation import QNetwork, FixedTarget -from all.agents import DQN +from all.agents import Agent, DQN, DQNTestAgent from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.memory import ExperienceReplayBuffer from all.optim import LinearScheduler from all.policies import GreedyPolicy -from .models import nature_dqn - - -def dqn( - # Common settings - device="cuda", - discount_factor=0.99, - last_frame=40e6, - # Adam optimizer settings - lr=1e-4, - eps=1.5e-4, - # 
Training settings - minibatch_size=32, - update_frequency=4, - target_update_frequency=1000, - # Replay buffer settings - replay_start_size=80000, - replay_buffer_size=1000000, - # Explicit exploration - initial_exploration=1., - final_exploration=0.01, - final_exploration_frame=4000000, - # Model construction - model_constructor=nature_dqn -): +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.atari.models import nature_dqn + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-4, + "eps": 1.5e-4, + # Training settings + "minibatch_size": 32, + "update_frequency": 4, + "target_update_frequency": 1000, + # Replay buffer settings + "replay_start_size": 80000, + "replay_buffer_size": 1000000, + # Explicit exploration + "initial_exploration": 1., + "final_exploration": 0.01, + "final_exploration_step": 250000, + "test_exploration": 0.001, + # Model construction + "model_constructor": nature_dqn +} + + +class DQNAtariPreset(Preset): """ - DQN Atari preset. + Deep Q-Network (DQN) Atari Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. - discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: + discount_factor (float, optional): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. @@ -47,50 +58,53 @@ def dqn( target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. - initial_exploration (int): Initial probability of choosing a random action, - decayed until final_exploration_frame. - final_exploration (int): Final probability of choosing a random action. - final_exploration_frame (int): The frame where the exploration decay stops. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent model_constructor (function): The function used to construct the neural model. 
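# Rough sketch of how the refactored preset classes are used, based only on the constructor
# and agent()/test_agent() methods defined in this file. The AtariEnvironment call is an
# assumption (it is not part of this diff); in practice the `dqn` PresetBuilder created at
# the bottom of this file would normally construct the preset for you.
import torch
from all.environments import AtariEnvironment
from all.presets.atari.dqn import DQNAtariPreset, default_hyperparameters

env = AtariEnvironment('Breakout', device='cpu')                   # assumed signature
preset = DQNAtariPreset(env, 'dqn', torch.device('cpu'), **default_hyperparameters)
agent = preset.agent(train_steps=1000000)        # training agent; schedules are sized to train_steps
test_agent = preset.test_agent()                 # evaluation agent using test_exploration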
""" - def _dqn(env, writer=DummyWriter()): - action_repeat = 4 - last_timestep = last_frame / action_repeat - last_update = (last_timestep - replay_start_size) / update_frequency - final_exploration_step = final_exploration_frame / action_repeat - model = model_constructor(env).to(device) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + hyperparameters = {**default_hyperparameters, **hyperparameters} + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency'] optimizer = Adam( - model.parameters(), - lr=lr, - eps=eps + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] ) q = QNetwork( - model, + self.model, optimizer, - scheduler=CosineAnnealingLR(optimizer, last_update), - target=FixedTarget(target_update_frequency), + scheduler=CosineAnnealingLR(optimizer, n_updates), + target=FixedTarget(self.hyperparameters['target_update_frequency']), writer=writer ) policy = GreedyPolicy( q, - env.action_space.n, + self.n_actions, epsilon=LinearScheduler( - initial_exploration, - final_exploration, - replay_start_size, - final_exploration_step - replay_start_size, - name="epsilon", + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + self.hyperparameters['replay_start_size'], + self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'], + name="exploration", writer=writer ) ) replay_buffer = ExperienceReplayBuffer( - replay_buffer_size, - device=device + self.hyperparameters['replay_buffer_size'], + device=self.device ) return DeepmindAtariBody( @@ -98,12 +112,20 @@ def _dqn(env, writer=DummyWriter()): q, policy, replay_buffer, - discount_factor=discount_factor, + discount_factor=self.hyperparameters['discount_factor'], loss=smooth_l1_loss, - minibatch_size=minibatch_size, - replay_start_size=replay_start_size, - update_frequency=update_frequency, + minibatch_size=self.hyperparameters['minibatch_size'], + replay_start_size=self.hyperparameters['replay_start_size'], + update_frequency=self.hyperparameters['update_frequency'], ), lazy_frames=True ) - return _dqn + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return DeepmindAtariBody( + DQNTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) + ) + + +dqn = PresetBuilder('dqn', default_hyperparameters, DQNAtariPreset) diff --git a/all/presets/atari/models/__init__.py b/all/presets/atari/models/__init__.py index bdef90a3..c2627408 100644 --- a/all/presets/atari/models/__init__.py +++ b/all/presets/atari/models/__init__.py @@ -1,9 +1,9 @@ -import numpy as np from all import nn + def nature_dqn(env, frames=4): return nn.Sequential( - nn.Scale(1/255), + nn.Scale(1 / 255), nn.Conv2d(frames, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), @@ -16,9 +16,10 @@ def nature_dqn(env, frames=4): nn.Linear0(512, env.action_space.n) ) + def nature_ddqn(env, frames=4): return nn.Sequential( - nn.Scale(1/255), + nn.Scale(1 / 255), nn.Conv2d(frames, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), @@ -40,9 +41,10 @@ def nature_ddqn(env, frames=4): ) ) + def nature_features(frames=4): return nn.Sequential( - nn.Scale(1/255), + nn.Scale(1 / 255), nn.Conv2d(frames, 32, 8, stride=4), nn.ReLU(), 
nn.Conv2d(32, 64, 4, stride=2), @@ -54,15 +56,18 @@ def nature_features(frames=4): nn.ReLU(), ) + def nature_value_head(): return nn.Linear(512, 1) + def nature_policy_head(env): return nn.Linear0(512, env.action_space.n) + def nature_c51(env, frames=4, atoms=51): return nn.Sequential( - nn.Scale(1/255), + nn.Scale(1 / 255), nn.Conv2d(frames, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), @@ -75,9 +80,10 @@ def nature_c51(env, frames=4, atoms=51): nn.Linear0(512, env.action_space.n * atoms) ) + def nature_rainbow(env, frames=4, hidden=512, atoms=51, sigma=0.5): return nn.Sequential( - nn.Scale(1/255), + nn.Scale(1 / 255), nn.Conv2d(frames, 32, 8, stride=4), nn.ReLU(), nn.Conv2d(32, 64, 4, stride=2), diff --git a/all/presets/atari/models/test_.py b/all/presets/atari/models/test_.py index 7f4b0cc4..68b1360d 100644 --- a/all/presets/atari/models/test_.py +++ b/all/presets/atari/models/test_.py @@ -20,32 +20,32 @@ def test_rainbow_model_cpu(self): tt.assert_almost_equal( out, torch.tensor([[ - 0.0676, -0.0235, 0.0690, -0.0713, -0.0287, 0.0053, -0.0463, 0.0495, - -0.0222, -0.0504, 0.0064, -0.0204, 0.0168, 0.0127, -0.0113, -0.0586, - -0.0544, 0.0114, -0.0077, 0.0666, -0.0663, -0.0420, -0.0698, -0.0314, - 0.0272, 0.0361, -0.0537, 0.0301, 0.0036, -0.0472, -0.0499, 0.0114, - 0.0182, 0.0008, -0.0132, -0.0803, -0.0087, -0.0017, 0.0598, -0.0627, - 0.0859, 0.0117, 0.0105, 0.0309, -0.0370, -0.0111, -0.0262, 0.0338, - 0.0141, -0.0385, 0.0547, 0.0648, -0.0370, 0.0107, -0.0629, -0.0163, - 0.0282, -0.0670, 0.0161, -0.0244, -0.0030, 0.0038, -0.0208, 0.0005, - 0.0125, 0.0608, -0.0089, 0.0026, 0.0562, -0.0678, 0.0841, -0.0265, - -0.0461, -0.0124, 0.0276, 0.0364, 0.0195, -0.0309, -0.0337, -0.0603, - -0.0252, -0.0356, 0.0221, 0.0184, -0.0154, -0.0136, -0.0277, 0.0283, - 0.0495, 0.0185, -0.0357, 0.0305, -0.0052, -0.0432, -0.0135, -0.0554, - -0.0094, 0.0272, 0.1030, 0.0049, 0.0012, -0.0140, 0.0146, -0.0979, - 0.0487, 0.0122, -0.0204, 0.0496, -0.0055, -0.0015, -0.0170, 0.0053, - 0.0104, -0.0742, 0.0742, -0.0381, 0.0104, -0.0065, -0.0564, 0.0453, - -0.0057, -0.0029, -0.0722, 0.0094, -0.0561, 0.0284, 0.0402, 0.0233, - -0.0716, -0.0424, 0.0165, -0.0505, 0.0006, 0.0219, -0.0601, 0.0656, - -0.0175, -0.0524, 0.0355, 0.0007, -0.0042, -0.0443, 0.0871, -0.0403, - -0.0031, 0.0171, -0.0359, -0.0520, -0.0344, 0.0239, 0.0099, 0.0004, - 0.0235, 0.0238, -0.0153, 0.0501, -0.0052, 0.0162, 0.0313, -0.0121, - 0.0009, -0.0366, -0.0628, 0.0386, -0.0671, 0.0480, -0.0595, 0.0568, - -0.0604, -0.0540, 0.0403, -0.0187, 0.0649, 0.0029, -0.0003, 0.0020, - -0.0056, 0.0471, -0.0145, -0.0126, -0.0395, -0.0455, -0.0437, 0.0056, - 0.0331, 0.0004, 0.0127, -0.0022, -0.0502, 0.0362, 0.0624, -0.0012, - -0.0515, 0.0303, -0.0357, -0.0420, 0.0321, -0.0162, 0.0007, -0.0272, - 0.0227, 0.0187, -0.0459, 0.0496 + 0.0676, -0.0235, 0.0690, -0.0713, -0.0287, 0.0053, -0.0463, 0.0495, + -0.0222, -0.0504, 0.0064, -0.0204, 0.0168, 0.0127, -0.0113, -0.0586, + -0.0544, 0.0114, -0.0077, 0.0666, -0.0663, -0.0420, -0.0698, -0.0314, + 0.0272, 0.0361, -0.0537, 0.0301, 0.0036, -0.0472, -0.0499, 0.0114, + 0.0182, 0.0008, -0.0132, -0.0803, -0.0087, -0.0017, 0.0598, -0.0627, + 0.0859, 0.0117, 0.0105, 0.0309, -0.0370, -0.0111, -0.0262, 0.0338, + 0.0141, -0.0385, 0.0547, 0.0648, -0.0370, 0.0107, -0.0629, -0.0163, + 0.0282, -0.0670, 0.0161, -0.0244, -0.0030, 0.0038, -0.0208, 0.0005, + 0.0125, 0.0608, -0.0089, 0.0026, 0.0562, -0.0678, 0.0841, -0.0265, + -0.0461, -0.0124, 0.0276, 0.0364, 0.0195, -0.0309, -0.0337, -0.0603, + -0.0252, -0.0356, 0.0221, 0.0184, -0.0154, 
-0.0136, -0.0277, 0.0283, + 0.0495, 0.0185, -0.0357, 0.0305, -0.0052, -0.0432, -0.0135, -0.0554, + -0.0094, 0.0272, 0.1030, 0.0049, 0.0012, -0.0140, 0.0146, -0.0979, + 0.0487, 0.0122, -0.0204, 0.0496, -0.0055, -0.0015, -0.0170, 0.0053, + 0.0104, -0.0742, 0.0742, -0.0381, 0.0104, -0.0065, -0.0564, 0.0453, + -0.0057, -0.0029, -0.0722, 0.0094, -0.0561, 0.0284, 0.0402, 0.0233, + -0.0716, -0.0424, 0.0165, -0.0505, 0.0006, 0.0219, -0.0601, 0.0656, + -0.0175, -0.0524, 0.0355, 0.0007, -0.0042, -0.0443, 0.0871, -0.0403, + -0.0031, 0.0171, -0.0359, -0.0520, -0.0344, 0.0239, 0.0099, 0.0004, + 0.0235, 0.0238, -0.0153, 0.0501, -0.0052, 0.0162, 0.0313, -0.0121, + 0.0009, -0.0366, -0.0628, 0.0386, -0.0671, 0.0480, -0.0595, 0.0568, + -0.0604, -0.0540, 0.0403, -0.0187, 0.0649, 0.0029, -0.0003, 0.0020, + -0.0056, 0.0471, -0.0145, -0.0126, -0.0395, -0.0455, -0.0437, 0.0056, + 0.0331, 0.0004, 0.0127, -0.0022, -0.0502, 0.0362, 0.0624, -0.0012, + -0.0515, 0.0303, -0.0357, -0.0420, 0.0321, -0.0162, 0.0007, -0.0272, + 0.0227, 0.0187, -0.0459, 0.0496 ]]), decimal=3 ) @@ -59,47 +59,47 @@ def test_rainbow_model_cuda(self): tt.assert_almost_equal( out.cpu(), torch.tensor([[ - -1.4765e-02, -4.0353e-02, -2.1705e-02, -2.2314e-02, 3.6881e-02, - -1.4175e-02, 1.2442e-02, -6.8713e-03, 2.4970e-02, 2.5681e-02, - -4.5859e-02, -2.3327e-02, 3.6205e-02, 7.1024e-03, -2.7564e-02, - 2.1592e-02, -3.2728e-02, 1.3602e-02, -1.1690e-02, -4.3082e-02, - -1.2996e-02, 1.7184e-02, 1.3446e-02, -3.3587e-03, -4.6350e-02, - -1.7646e-02, 2.1954e-02, 8.5546e-03, -2.1359e-02, -2.4206e-02, - -2.3151e-02, -3.6330e-02, 4.4699e-02, 3.9887e-03, 1.5609e-02, - -4.3950e-02, 1.0955e-02, -2.4277e-02, 1.4915e-02, 3.2508e-03, - 6.1454e-02, 3.5242e-02, -1.5274e-02, -2.6729e-02, -2.4072e-02, - 1.5696e-02, 2.6622e-02, -3.5404e-02, 5.1701e-02, -5.3047e-02, - -1.8412e-02, 8.6640e-03, -3.1722e-02, 4.0329e-02, 1.2896e-02, + -1.4765e-02, -4.0353e-02, -2.1705e-02, -2.2314e-02, 3.6881e-02, + -1.4175e-02, 1.2442e-02, -6.8713e-03, 2.4970e-02, 2.5681e-02, + -4.5859e-02, -2.3327e-02, 3.6205e-02, 7.1024e-03, -2.7564e-02, + 2.1592e-02, -3.2728e-02, 1.3602e-02, -1.1690e-02, -4.3082e-02, + -1.2996e-02, 1.7184e-02, 1.3446e-02, -3.3587e-03, -4.6350e-02, + -1.7646e-02, 2.1954e-02, 8.5546e-03, -2.1359e-02, -2.4206e-02, + -2.3151e-02, -3.6330e-02, 4.4699e-02, 3.9887e-03, 1.5609e-02, + -4.3950e-02, 1.0955e-02, -2.4277e-02, 1.4915e-02, 3.2508e-03, + 6.1454e-02, 3.5242e-02, -1.5274e-02, -2.6729e-02, -2.4072e-02, + 1.5696e-02, 2.6622e-02, -3.5404e-02, 5.1701e-02, -5.3047e-02, + -1.8412e-02, 8.6640e-03, -3.1722e-02, 4.0329e-02, 1.2896e-02, -1.4139e-02, -4.9200e-02, -4.6193e-02, -2.9064e-03, -2.2078e-02, -4.0084e-02, -8.3519e-03, -2.7589e-02, -4.9979e-03, -1.6055e-02, - -4.5311e-02, -2.6951e-02, 2.8032e-02, -4.0069e-03, 3.2405e-02, - -5.3164e-03, -3.0139e-03, 6.6179e-04, -4.9243e-02, 3.2515e-02, - 9.8307e-03, -3.4257e-03, -3.9522e-02, 1.2594e-02, -2.7210e-02, - 2.3451e-02, 4.2257e-02, 2.2239e-02, 1.4304e-04, 4.2905e-04, - 1.5193e-02, 3.1897e-03, -1.0828e-02, -4.8345e-02, 6.8747e-02, - -7.1725e-03, -9.7815e-03, -1.6331e-02, 1.0434e-02, -8.8083e-04, - 3.8219e-02, 6.8332e-03, -2.0189e-02, 2.8141e-02, 1.4913e-02, - -2.4925e-02, -2.8922e-02, -7.1546e-03, 1.9791e-02, 1.1160e-02, - 1.0306e-02, -1.3631e-02, 2.7318e-03, 1.4050e-03, -8.2064e-03, - 3.5836e-02, -1.5877e-02, -1.1198e-02, 1.9514e-02, 3.0832e-03, - -6.2730e-02, 6.1493e-03, -1.2340e-02, 3.9110e-02, -2.6895e-02, - -5.1718e-03, 7.5017e-03, 1.2673e-03, 4.7525e-02, 1.7373e-03, - -5.1745e-03, -2.8621e-02, 3.4984e-02, -3.2622e-02, 1.0748e-02, - 
1.2499e-02, -1.8788e-02, -8.6717e-03, 4.3620e-02, 2.8460e-02, - -6.8146e-03, -3.5824e-02, 9.2931e-03, 3.7893e-03, 2.4187e-02, - 1.3393e-02, -5.9393e-03, -9.9837e-03, -8.1019e-03, -2.1840e-02, - -3.8945e-02, 1.6736e-02, -4.7475e-02, 4.9770e-02, 3.4695e-02, - 1.8961e-02, 2.7416e-02, -1.3578e-02, -9.8595e-03, 2.2834e-03, - 2.4829e-02, -4.3998e-02, 3.2398e-02, -1.4200e-02, 2.4907e-02, - -2.2542e-02, -9.2765e-03, 2.0658e-03, -4.1246e-03, -1.8095e-02, - -1.2732e-02, -3.2090e-03, 1.3127e-02, -2.0888e-02, 1.4931e-02, - -4.0576e-02, 4.2877e-02, 7.9411e-05, -4.4377e-02, 3.2357e-03, - 1.6201e-02, 4.0387e-02, -1.9023e-02, 5.8033e-02, -3.3424e-02, - 2.9598e-03, -1.8526e-02, -2.2967e-02, 4.3449e-02, -1.2564e-02, + -4.5311e-02, -2.6951e-02, 2.8032e-02, -4.0069e-03, 3.2405e-02, + -5.3164e-03, -3.0139e-03, 6.6179e-04, -4.9243e-02, 3.2515e-02, + 9.8307e-03, -3.4257e-03, -3.9522e-02, 1.2594e-02, -2.7210e-02, + 2.3451e-02, 4.2257e-02, 2.2239e-02, 1.4304e-04, 4.2905e-04, + 1.5193e-02, 3.1897e-03, -1.0828e-02, -4.8345e-02, 6.8747e-02, + -7.1725e-03, -9.7815e-03, -1.6331e-02, 1.0434e-02, -8.8083e-04, + 3.8219e-02, 6.8332e-03, -2.0189e-02, 2.8141e-02, 1.4913e-02, + -2.4925e-02, -2.8922e-02, -7.1546e-03, 1.9791e-02, 1.1160e-02, + 1.0306e-02, -1.3631e-02, 2.7318e-03, 1.4050e-03, -8.2064e-03, + 3.5836e-02, -1.5877e-02, -1.1198e-02, 1.9514e-02, 3.0832e-03, + -6.2730e-02, 6.1493e-03, -1.2340e-02, 3.9110e-02, -2.6895e-02, + -5.1718e-03, 7.5017e-03, 1.2673e-03, 4.7525e-02, 1.7373e-03, + -5.1745e-03, -2.8621e-02, 3.4984e-02, -3.2622e-02, 1.0748e-02, + 1.2499e-02, -1.8788e-02, -8.6717e-03, 4.3620e-02, 2.8460e-02, + -6.8146e-03, -3.5824e-02, 9.2931e-03, 3.7893e-03, 2.4187e-02, + 1.3393e-02, -5.9393e-03, -9.9837e-03, -8.1019e-03, -2.1840e-02, + -3.8945e-02, 1.6736e-02, -4.7475e-02, 4.9770e-02, 3.4695e-02, + 1.8961e-02, 2.7416e-02, -1.3578e-02, -9.8595e-03, 2.2834e-03, + 2.4829e-02, -4.3998e-02, 3.2398e-02, -1.4200e-02, 2.4907e-02, + -2.2542e-02, -9.2765e-03, 2.0658e-03, -4.1246e-03, -1.8095e-02, + -1.2732e-02, -3.2090e-03, 1.3127e-02, -2.0888e-02, 1.4931e-02, + -4.0576e-02, 4.2877e-02, 7.9411e-05, -4.4377e-02, 3.2357e-03, + 1.6201e-02, 4.0387e-02, -1.9023e-02, 5.8033e-02, -3.3424e-02, + 2.9598e-03, -1.8526e-02, -2.2967e-02, 4.3449e-02, -1.2564e-02, -9.3756e-03, -2.1745e-02, -2.7089e-02, -3.6791e-02, -5.2018e-02, - 2.4588e-02, 1.0037e-03, 3.9753e-02, 4.3534e-02, 2.6446e-02, - -1.1808e-02, 2.1426e-02, 7.5522e-03, 2.2847e-03, -2.7211e-02, - 4.1364e-02, -1.1281e-02, 1.6523e-03, -1.9913e-03 + 2.4588e-02, 1.0037e-03, 3.9753e-02, 4.3534e-02, 2.6446e-02, + -1.1808e-02, 2.1426e-02, 7.5522e-03, 2.2847e-03, -2.7211e-02, + 4.1364e-02, -1.1281e-02, 1.6523e-03, -1.9913e-03 ]]), decimal=3 ) @@ -112,34 +112,35 @@ def test_rainbow_model_cuda(self): out.cpu(), torch.tensor([[ -0.0247, -0.0172, -0.0633, -0.0154, -0.0156, -0.1156, -0.0793, -0.0184, - -0.0408, 0.0005, -0.0920, -0.0481, -0.0597, -0.0243, 0.0006, -0.1045, + -0.0408, 0.0005, -0.0920, -0.0481, -0.0597, -0.0243, 0.0006, -0.1045, -0.0476, -0.0030, -0.0230, -0.0869, -0.0149, -0.0412, -0.0753, -0.0640, -0.1106, -0.0632, -0.0645, -0.0474, -0.0124, -0.0698, -0.0275, -0.0415, -0.0916, -0.0957, -0.0851, -0.1296, -0.1049, -0.0196, -0.0823, -0.0380, -0.1085, -0.0526, -0.0083, -0.1274, -0.0426, -0.0183, -0.0585, -0.0366, -0.1111, -0.0074, -0.1238, -0.0324, -0.0166, -0.0719, -0.0285, -0.0427, - -0.1158, -0.0569, 0.0075, -0.0419, -0.0288, -0.1189, -0.0220, -0.0370, - 0.0040, 0.0228, -0.0958, -0.0258, -0.0276, -0.0405, -0.0958, -0.0201, - -0.0639, -0.0543, -0.0705, -0.0940, -0.0700, -0.0921, 
-0.0426, 0.0026, + -0.1158, -0.0569, 0.0075, -0.0419, -0.0288, -0.1189, -0.0220, -0.0370, + 0.0040, 0.0228, -0.0958, -0.0258, -0.0276, -0.0405, -0.0958, -0.0201, + -0.0639, -0.0543, -0.0705, -0.0940, -0.0700, -0.0921, -0.0426, 0.0026, -0.0556, -0.0439, -0.0386, -0.0957, -0.0915, -0.0679, -0.1272, -0.0754, -0.0076, -0.1046, -0.0350, -0.0887, -0.0350, -0.0270, -0.1188, -0.0449, - 0.0020, -0.0406, 0.0011, -0.0842, -0.0422, -0.1280, -0.0205, 0.0002, + 0.0020, -0.0406, 0.0011, -0.0842, -0.0422, -0.1280, -0.0205, 0.0002, -0.0789, -0.0185, -0.0510, -0.1180, -0.0550, -0.0159, -0.0702, -0.0029, - -0.0891, -0.0253, -0.0485, -0.0128, 0.0010, -0.0870, -0.0230, -0.0233, + -0.0891, -0.0253, -0.0485, -0.0128, 0.0010, -0.0870, -0.0230, -0.0233, -0.0411, -0.0870, -0.0419, -0.0688, -0.0583, -0.0448, -0.0864, -0.0926, - -0.0758, -0.0540, 0.0058, -0.0843, -0.0365, -0.0608, -0.0787, -0.0938, - -0.0680, -0.0995, -0.0764, 0.0061, -0.0821, -0.0636, -0.0848, -0.0373, + -0.0758, -0.0540, 0.0058, -0.0843, -0.0365, -0.0608, -0.0787, -0.0938, + -0.0680, -0.0995, -0.0764, 0.0061, -0.0821, -0.0636, -0.0848, -0.0373, -0.0285, -0.1086, -0.0464, -0.0228, -0.0464, -0.0279, -0.1053, -0.0224, -0.1268, -0.0006, -0.0186, -0.0836, -0.0011, -0.0415, -0.1222, -0.0668, - -0.0015, -0.0535, -0.0071, -0.1202, -0.0257, -0.0503, 0.0004, 0.0099, + -0.0015, -0.0535, -0.0071, -0.1202, -0.0257, -0.0503, 0.0004, 0.0099, -0.1113, -0.0182, -0.0080, -0.0216, -0.0661, -0.0115, -0.0468, -0.0716, -0.0404, -0.0950, -0.0681, -0.0933, -0.0699, -0.0154, -0.0853, -0.0414, -0.0403, -0.0700, -0.0685, -0.0975, -0.0934, -0.1016, -0.0121, -0.1084, - -0.0391, -0.1006, -0.0441, -0.0024, -0.1232, -0.0159, 0.0012, -0.0480, + -0.0391, -0.1006, -0.0441, -0.0024, -0.1232, -0.0159, 0.0012, -0.0480, -0.0013, -0.0789, -0.0309, -0.1101 ]]), decimal=3 ) + if __name__ == "__main__": unittest.main() diff --git a/all/presets/atari/ppo.py b/all/presets/atari/ppo.py index aba6720c..543741d6 100644 --- a/all/presets/atari/ppo.py +++ b/all/presets/atari/ppo.py @@ -1,47 +1,54 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR -from all.agents import PPO +from all.agents import PPO, PPOTestAgent from all.bodies import DeepmindAtariBody from all.approximation import VNetwork, FeatureNetwork from all.logging import DummyWriter from all.optim import LinearScheduler from all.policies import SoftmaxPolicy -from .models import nature_features, nature_value_head, nature_policy_head - - -def ppo( - # Common settings - device="cuda", - discount_factor=0.99, - last_frame=40e6, - # Adam optimizer settings - lr=2.5e-4, # Adam learning rate - eps=1e-5, # Adam stability - # Other optimization settings - clip_grad=0.5, - entropy_loss_scaling=0.01, - value_loss_scaling=0.5, - clip_initial=0.1, - clip_final=0.01, - # Batch settings - epochs=4, - minibatches=4, - n_envs=8, - n_steps=128, - # GAE settings - lam=0.95, - # Model construction - feature_model_constructor=nature_features, - value_model_constructor=nature_value_head, - policy_model_constructor=nature_policy_head -): +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.atari.models import nature_features, nature_value_head, nature_policy_head + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 2.5e-4, + "eps": 1e-5, + # Other optimization settings + "clip_grad": 0.5, + "entropy_loss_scaling": 0.01, + "value_loss_scaling": 0.5, + "clip_initial": 0.1, + 
"clip_final": 0.01, + # Batch settings + "epochs": 4, + "minibatches": 4, + "n_envs": 8, + "n_steps": 128, + # GAE settings + "lam": 0.95, + # Model construction + "feature_model_constructor": nature_features, + "value_model_constructor": nature_value_head, + "policy_model_constructor": nature_policy_head +} + + +class PPOAtariPreset(ParallelPreset): """ - PPO Atari preset. + Proximal Policy Optimization (PPO) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. clip_grad (float): The maximum magnitude of the gradient for any given parameter. @@ -50,7 +57,7 @@ def ppo( value_loss_scaling (float): Coefficient for the value function loss. clip_initial (float): Value for epsilon in the clipped PPO objective function at the beginning of training. clip_final (float): Value for epsilon in the clipped PPO objective function at the end of training. - epochs (int): Number of times to iterature through each batch. + epochs (int): Number of times to literature through each batch. minibatches (int): The number of minibatches to split each batch into. n_envs (int): Number of parallel actors. n_steps (int): Length of each rollout. @@ -59,54 +66,43 @@ def ppo( value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. 
""" - def _ppo(envs, writer=DummyWriter()): - env = envs[0] - # Update epoch * minibatches times per update, - # but we only update once per n_steps, - # with n_envs and 4 frames per step - final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor']().to(device) - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(env).to(device) - feature_model = feature_model_constructor().to(device) + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs']) - feature_optimizer = Adam( - feature_model.parameters(), lr=lr, eps=eps - ) - value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) - policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) features = FeatureNetwork( - feature_model, + self.feature_model, feature_optimizer, - clip_grad=clip_grad, - scheduler=CosineAnnealingLR( - feature_optimizer, - final_anneal_step - ), + scheduler=CosineAnnealingLR(feature_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + v = VNetwork( - value_model, + self.value_model, value_optimizer, - loss_scaling=value_loss_scaling, - clip_grad=clip_grad, - writer=writer, - scheduler=CosineAnnealingLR( - value_optimizer, - final_anneal_step - ), + scheduler=CosineAnnealingLR(value_optimizer, n_updates), + loss_scaling=self.hyperparameters["value_loss_scaling"], + clip_grad=self.hyperparameters["clip_grad"], + writer=writer ) + policy = SoftmaxPolicy( - policy_model, + self.policy_model, policy_optimizer, - clip_grad=clip_grad, - writer=writer, - scheduler=CosineAnnealingLR( - policy_optimizer, - final_anneal_step - ), + scheduler=CosineAnnealingLR(policy_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], + writer=writer ) return DeepmindAtariBody( @@ -115,25 +111,28 @@ def _ppo(envs, writer=DummyWriter()): v, policy, epsilon=LinearScheduler( - clip_initial, - clip_final, + self.hyperparameters["clip_initial"], + self.hyperparameters["clip_final"], 0, - final_anneal_step, + n_updates, name='clip', writer=writer ), - epochs=epochs, - minibatches=minibatches, - n_envs=n_envs, - n_steps=n_steps, - discount_factor=discount_factor, - lam=lam, - entropy_loss_scaling=entropy_loss_scaling, + epochs=self.hyperparameters["epochs"], + minibatches=self.hyperparameters["minibatches"], + n_envs=self.hyperparameters["n_envs"], + n_steps=self.hyperparameters["n_steps"], + discount_factor=self.hyperparameters["discount_factor"], + lam=self.hyperparameters["lam"], + entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"], writer=writer, ) ) - return _ppo, n_envs + def test_agent(self): + features = 
FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return DeepmindAtariBody(PPOTestAgent(features, policy)) -__all__ = ["ppo"] +ppo = ParallelPresetBuilder('ppo', default_hyperparameters, PPOAtariPreset) diff --git a/all/presets/atari/rainbow.py b/all/presets/atari/rainbow.py index 8004035d..59c8304d 100644 --- a/all/presets/atari/rainbow.py +++ b/all/presets/atari/rainbow.py @@ -1,53 +1,59 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR from all.approximation import QDist, FixedTarget -from all.agents import Rainbow +from all.agents import Rainbow, RainbowTestAgent from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.memory import PrioritizedReplayBuffer, NStepReplayBuffer from all.optim import LinearScheduler -from .models import nature_rainbow +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.atari.models import nature_rainbow -def rainbow( - # Common settings - device="cuda", - discount_factor=0.99, - last_frame=40e6, - # Adam optimizer settings - lr=1e-4, - eps=1.5e-4, - # Training settings - minibatch_size=32, - update_frequency=4, - target_update_frequency=1000, - # Replay buffer settings - replay_start_size=80000, - replay_buffer_size=1000000, - # Explicit exploration - initial_exploration=0.02, - final_exploration=0., - # Prioritized replay settings - alpha=0.5, - beta=0.5, - # Multi-step learning - n_steps=3, - # Distributional RL - atoms=51, - v_min=-10, - v_max=10, - # Noisy Nets - sigma=0.5, - # Model construction - model_constructor=nature_rainbow -): +default_hyperparameters = { + "discount_factor": 0.99, + "lr": 1e-4, + "eps": 1.5e-4, + # Training settings + "minibatch_size": 32, + "update_frequency": 4, + "target_update_frequency": 1000, + # Replay buffer settings + "replay_start_size": 80000, + "replay_buffer_size": 1000000, + # Explicit exploration + "initial_exploration": 0.02, + "final_exploration": 0., + "test_exploration": 0.001, + # Prioritized replay settings + "alpha": 0.5, + "beta": 0.5, + # Multi-step learning + "n_steps": 3, + # Distributional RL + "atoms": 51, + "v_min": -10, + "v_max": 10, + # Noisy Nets + "sigma": 0.5, + # Model construction + "model_constructor": nature_rainbow +} + + +class RainbowAtariPreset(Preset): """ - Rainbow Atari Preset. + Rainbow DQN Atari Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. @@ -55,9 +61,9 @@ def rainbow( target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. - initial_exploration (int): Initial probability of choosing a random action, + initial_exploration (float): Initial probability of choosing a random action, decayed over course of training. 
- final_exploration (int): Final probability of choosing a random action. + final_exploration (float): Final probability of choosing a random action. alpha (float): Amount of prioritization in the prioritized experience replay buffer. (0 = no prioritization, 1 = full prioritization) beta (float): The strength of the importance sampling correction for prioritized experience replay. @@ -66,53 +72,80 @@ def rainbow( atoms (int): The number of atoms in the categorical distribution used to represent the distributional value function. v_min (int): The expected return corresponding to the smallest atom. - v_max (int): The expected return correspodning to the larget atom. + v_max (int): The expected return corresponding to the largest atom. sigma (float): Initial noisy network noise. model_constructor (function): The function used to construct the neural model. """ - def _rainbow(env, writer=DummyWriter()): - action_repeat = 4 - last_timestep = last_frame / action_repeat - last_update = (last_timestep - replay_start_size) / update_frequency - model = model_constructor(env, atoms=atoms, sigma=sigma).to(device) - optimizer = Adam(model.parameters(), lr=lr, eps=eps) - q = QDist( - model, + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env, atoms=hyperparameters["atoms"], sigma=hyperparameters["sigma"]).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency'] + + optimizer = Adam( + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] + ) + + q_dist = QDist( + self.model, optimizer, - env.action_space.n, - atoms, - scheduler=CosineAnnealingLR(optimizer, last_update), - v_min=v_min, - v_max=v_max, - target=FixedTarget(target_update_frequency), + self.n_actions, + self.hyperparameters['atoms'], + scheduler=CosineAnnealingLR(optimizer, n_updates), + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], + target=FixedTarget(self.hyperparameters['target_update_frequency']), writer=writer, ) - replay_buffer = PrioritizedReplayBuffer( - replay_buffer_size, - alpha=alpha, - beta=beta, - device=device + + replay_buffer = NStepReplayBuffer( + self.hyperparameters['n_steps'], + self.hyperparameters['discount_factor'], + PrioritizedReplayBuffer( + self.hyperparameters['replay_buffer_size'], + alpha=self.hyperparameters['alpha'], + beta=self.hyperparameters['beta'], + device=self.device + ) ) - replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer) - agent = Rainbow( - q, - replay_buffer, - exploration=LinearScheduler( - initial_exploration, - final_exploration, - 0, - last_timestep, - name='exploration', - writer=writer + return DeepmindAtariBody( + Rainbow( + q_dist, + replay_buffer, + exploration=LinearScheduler( + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + 0, + train_steps - self.hyperparameters['replay_start_size'], + name="exploration", + writer=writer + ), + discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"], + minibatch_size=self.hyperparameters['minibatch_size'], + replay_start_size=self.hyperparameters['replay_start_size'], + update_frequency=self.hyperparameters['update_frequency'], + writer=writer, ), - discount_factor=discount_factor ** n_steps, - 
minibatch_size=minibatch_size, - replay_start_size=replay_start_size, - update_frequency=update_frequency, - writer=writer, + lazy_frames=True, + episodic_lives=True ) - return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True) - return _rainbow + def test_agent(self): + q_dist = QDist( + copy.deepcopy(self.model), + None, + self.n_actions, + self.hyperparameters['atoms'], + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], + ) + return DeepmindAtariBody(RainbowTestAgent(q_dist, self.n_actions, self.hyperparameters["test_exploration"])) + + +rainbow = PresetBuilder('rainbow', default_hyperparameters, RainbowAtariPreset) diff --git a/all/presets/atari/vac.py b/all/presets/atari/vac.py index 1418559f..43f08298 100644 --- a/all/presets/atari/vac.py +++ b/all/presets/atari/vac.py @@ -1,37 +1,46 @@ +import copy from torch.optim import Adam -from all.agents import VAC +from torch.optim.lr_scheduler import CosineAnnealingLR +from all.agents import VAC, VACTestAgent from all.approximation import VNetwork, FeatureNetwork from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.policies import SoftmaxPolicy -from .models import nature_features, nature_value_head, nature_policy_head - - -def vac( - # Common settings - device="cuda", - discount_factor=0.99, - # Adam optimizer settings - lr_v=5e-4, - lr_pi=1e-4, - eps=1.5e-4, - # Other optimization settings - clip_grad=0.5, - value_loss_scaling=0.25, - # Parallel actors - n_envs=16, - # Model construction - feature_model_constructor=nature_features, - value_model_constructor=nature_value_head, - policy_model_constructor=nature_policy_head -): +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.atari.models import nature_features, nature_value_head, nature_policy_head + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr_v": 5e-4, + "lr_pi": 1e-4, + "eps": 1.5e-4, + # Other optimization settings + "clip_grad": 0.5, + "value_loss_scaling": 0.25, + # Parallel actors + "n_envs": 16, + # Model construction + "feature_model_constructor": nature_features, + "value_model_constructor": nature_value_head, + "policy_model_constructor": nature_policy_head +} + + +class VACAtariPreset(ParallelPreset): """ - Vanilla Actor-Critic Atari preset. + Vanilla Actor-Critic (VAC) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr_v (float): Learning rate for value network. lr_pi (float): Learning rate for policy network and feature network. eps (float): Stability parameters for the Adam optimizer. @@ -43,36 +52,53 @@ def vac( value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. 
""" - def _vac(envs, writer=DummyWriter()): - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(envs[0]).to(device) - feature_model = feature_model_constructor().to(device) - value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps) - policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps) - feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor']().to(device) + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps / self.hyperparameters["n_envs"] + + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) + + features = FeatureNetwork( + self.feature_model, + feature_optimizer, + scheduler=CosineAnnealingLR(feature_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], + writer=writer + ) v = VNetwork( - value_model, + self.value_model, value_optimizer, - loss_scaling=value_loss_scaling, - clip_grad=clip_grad, - writer=writer, + scheduler=CosineAnnealingLR(value_optimizer, n_updates), + loss_scaling=self.hyperparameters["value_loss_scaling"], + clip_grad=self.hyperparameters["clip_grad"], + writer=writer ) + policy = SoftmaxPolicy( - policy_model, + self.policy_model, policy_optimizer, - clip_grad=clip_grad, - writer=writer, - ) - features = FeatureNetwork( - feature_model, - feature_optimizer, - clip_grad=clip_grad, + scheduler=CosineAnnealingLR(policy_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) return DeepmindAtariBody( - VAC(features, v, policy, discount_factor=discount_factor), + VAC(features, v, policy, discount_factor=self.hyperparameters["discount_factor"]), ) - return _vac, n_envs + + def test_agent(self): + features = FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return DeepmindAtariBody(VACTestAgent(features, policy)) + + +vac = ParallelPresetBuilder('vac', default_hyperparameters, VACAtariPreset) diff --git a/all/presets/atari/vpg.py b/all/presets/atari/vpg.py index 62a18d04..4eb007ff 100644 --- a/all/presets/atari/vpg.py +++ b/all/presets/atari/vpg.py @@ -1,37 +1,45 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR -from all.agents import VPG +from all.agents import VPG, VPGTestAgent from all.approximation import VNetwork, FeatureNetwork from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.policies import SoftmaxPolicy -from .models import nature_features, nature_value_head, nature_policy_head - - -def vpg( - # Common settings - device="cuda", - discount_factor=0.99, - last_frame=40e6, - # Adam optimizer settings - lr=7e-4, - eps=1.5e-4, - # Other optimization settings - clip_grad=0.5, - value_loss_scaling=0.25, - min_batch_size=1000, - # Model construction - feature_model_constructor=nature_features, - 
value_model_constructor=nature_value_head, - policy_model_constructor=nature_policy_head -): +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.atari.models import nature_features, nature_value_head, nature_policy_head + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr_v": 5e-4, + "lr_pi": 1e-4, + "eps": 1.5e-4, + # Other optimization settings + "clip_grad": 0.5, + "value_loss_scaling": 0.25, + "min_batch_size": 1000, + # Model construction + "feature_model_constructor": nature_features, + "value_model_constructor": nature_value_head, + "policy_model_constructor": nature_policy_head +} + + +class VPGAtariPreset(Preset): """ - Vanilla Policy Gradient Atari preset. + Vanilla Policy Gradient (VPG) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. clip_grad (float): The maximum magnitude of the gradient for any given parameter. @@ -43,54 +51,53 @@ def vpg( value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. """ - final_anneal_step = last_frame / (min_batch_size * 4) - def _vpg_atari(env, writer=DummyWriter()): - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(env).to(device) - feature_model = feature_model_constructor().to(device) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor']().to(device) - feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps) - value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) - policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps / self.hyperparameters["min_batch_size"] + + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) features = FeatureNetwork( - feature_model, + self.feature_model, feature_optimizer, - scheduler=CosineAnnealingLR( - feature_optimizer, - final_anneal_step, - ), - clip_grad=clip_grad, + scheduler=CosineAnnealingLR(feature_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + v = VNetwork( - value_model, + self.value_model, value_optimizer, - scheduler=CosineAnnealingLR( - value_optimizer, - final_anneal_step, - ), - loss_scaling=value_loss_scaling, - clip_grad=clip_grad, + 
scheduler=CosineAnnealingLR(value_optimizer, n_updates), + loss_scaling=self.hyperparameters["value_loss_scaling"], + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + policy = SoftmaxPolicy( - policy_model, + self.policy_model, policy_optimizer, - scheduler=CosineAnnealingLR( - policy_optimizer, - final_anneal_step, - ), - clip_grad=clip_grad, + scheduler=CosineAnnealingLR(policy_optimizer, n_updates), + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) return DeepmindAtariBody( - VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size), - episodic_lives=True + VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]), ) - return _vpg_atari + + def test_agent(self): + features = FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return DeepmindAtariBody(VPGTestAgent(features, policy)) -__all__ = ["vpg"] +vpg = PresetBuilder('vpg', default_hyperparameters, VPGAtariPreset) diff --git a/all/presets/atari/vqn.py b/all/presets/atari/vqn.py index 2d66f0a8..951feff0 100644 --- a/all/presets/atari/vqn.py +++ b/all/presets/atari/vqn.py @@ -1,68 +1,100 @@ +import copy from torch.optim import Adam +from torch.optim.lr_scheduler import CosineAnnealingLR from all.approximation import QNetwork -from all.agents import VQN +from all.agents import VQN, VQNTestAgent from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.optim import LinearScheduler from all.policies import ParallelGreedyPolicy -from .models import nature_ddqn +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.atari.models import nature_ddqn -def vqn( - # Common settings - device="cuda", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-3, - eps=1.5e-4, - # Exploration settings - initial_exploration=1., - final_exploration=0.02, - final_exploration_frame=1000000, - # Parallel actors - n_envs=64, - # Model construction - model_constructor=nature_ddqn -): + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-3, + "eps": 1.5e-4, + # Explicit exploration + "initial_exploration": 1., + "final_exploration": 0.01, + "final_exploration_step": 250000, + "test_exploration": 0.001, + # Parallel actors + "n_envs": 64, + # Model construction + "model_constructor": nature_ddqn +} + + +class VQNAtariPreset(ParallelPreset): """ - Vanilla Q-Network Atari preset. + Vanilla Q-Network (VQN) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. - initial_exploration (int): Initial probability of choosing a random action, - decayed until final_exploration_frame. - final_exploration (int): Final probability of choosing a random action. - final_exploration_frame (int): The frame where the exploration decay stops. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. 
+ final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent n_envs (int): Number of parallel environments. model_constructor (function): The function used to construct the neural model. """ - def _vqn(envs, writer=DummyWriter()): - action_repeat = 4 - final_exploration_timestep = final_exploration_frame / action_repeat - env = envs[0] - model = model_constructor(env).to(device) - optimizer = Adam(model.parameters(), lr=lr, eps=eps) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps / self.hyperparameters['n_envs'] + + optimizer = Adam( + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] + ) + q = QNetwork( - model, + self.model, optimizer, + scheduler=CosineAnnealingLR(optimizer, n_updates), writer=writer ) + policy = ParallelGreedyPolicy( q, - env.action_space.n, + self.n_actions, epsilon=LinearScheduler( - initial_exploration, - final_exploration, + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], 0, - final_exploration_timestep, - name="epsilon", + self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"], + name="exploration", writer=writer ) ) + return DeepmindAtariBody( - VQN(q, policy, discount_factor=discount_factor), + VQN(q, policy, discount_factor=self.hyperparameters['discount_factor']), ) - return _vqn, n_envs + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return DeepmindAtariBody( + VQNTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) + ) + + +vqn = ParallelPresetBuilder('vqn', default_hyperparameters, VQNAtariPreset) diff --git a/all/presets/atari/vsarsa.py b/all/presets/atari/vsarsa.py index afad489a..10224f06 100644 --- a/all/presets/atari/vsarsa.py +++ b/all/presets/atari/vsarsa.py @@ -1,68 +1,100 @@ +import copy from torch.optim import Adam +from torch.optim.lr_scheduler import CosineAnnealingLR from all.approximation import QNetwork -from all.agents import VSarsa +from all.agents import VSarsa, VSarsaTestAgent from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.optim import LinearScheduler from all.policies import ParallelGreedyPolicy -from .models import nature_ddqn +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.atari.models import nature_ddqn -def vsarsa( - # Common settings - device="cuda", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-3, - eps=1.5e-4, - # Exploration settings - final_exploration_frame=1000000, - final_exploration=0.02, - initial_exploration=1., - # Parallel actors - n_envs=64, - # Model construction - model_constructor=nature_ddqn -): + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-3, + "eps": 1.5e-4, + # Explicit exploration + "initial_exploration": 1., + "final_exploration": 0.01, + "final_exploration_step": 250000, + "test_exploration": 0.001, + # Parallel actors + "n_envs": 64, + # Model construction + "model_constructor": nature_ddqn +} + + +class VSarsaAtariPreset(ParallelPreset): """ 
- Vanilla SARSA Atari preset. + Vanilla SARSA (VSarsa) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. - initial_exploration (int): Initial probability of choosing a random action, - decayed until final_exploration_frame. - final_exploration (int): Final probability of choosing a random action. - final_exploration_frame (int): The frame where the exploration decay stops. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent n_envs (int): Number of parallel environments. model_constructor (function): The function used to construct the neural model. """ - def _vsarsa(envs, writer=DummyWriter()): - action_repeat = 4 - final_exploration_timestep = final_exploration_frame / action_repeat - env = envs[0] - model = model_constructor(env).to(device) - optimizer = Adam(model.parameters(), lr=lr, eps=eps) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps / self.hyperparameters['n_envs'] + + optimizer = Adam( + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] + ) + q = QNetwork( - model, + self.model, optimizer, + scheduler=CosineAnnealingLR(optimizer, n_updates), writer=writer ) + policy = ParallelGreedyPolicy( q, - env.action_space.n, + self.n_actions, epsilon=LinearScheduler( - initial_exploration, - final_exploration, + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], 0, - final_exploration_timestep, - name="epsilon", + self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"], + name="exploration", writer=writer ) ) + return DeepmindAtariBody( - VSarsa(q, policy, discount_factor=discount_factor), + VSarsa(q, policy, discount_factor=self.hyperparameters['discount_factor']), ) - return _vsarsa, n_envs + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return DeepmindAtariBody( + VSarsaTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) + ) + + +vsarsa = ParallelPresetBuilder('vsarsa', default_hyperparameters, VSarsaAtariPreset) diff --git a/all/presets/atari_test.py b/all/presets/atari_test.py index d34d34d7..8deff7ee 100644 --- a/all/presets/atari_test.py +++ b/all/presets/atari_test.py @@ -1,7 +1,8 @@ +import os import unittest import torch from all.environments import AtariEnvironment -from all.presets.validate_agent import validate_agent +from all.logging import DummyWriter from all.presets.atari import ( a2c, c51, @@ -15,105 +16,59 @@ vqn ) -CPU = torch.device("cpu") -if torch.cuda.is_available(): - CUDA = torch.device("cuda") -else: - print( - "WARNING: CUDA is not 
available!", - "Running presets in cpu mode.", - "Enable CUDA for full test coverage!", - ) - CUDA = torch.device("cpu") - class TestAtariPresets(unittest.TestCase): - def test_a2c(self): - validate_agent(a2c(device=CPU), AtariEnvironment("Breakout", device=CPU)) + def setUp(self): + self.env = AtariEnvironment('Breakout') + self.env.reset() - def test_a2c_cuda(self): - validate_agent(a2c(device=CUDA), AtariEnvironment("Breakout", device=CUDA)) + def tearDown(self): + if os.path.exists('test_preset.pt'): + os.remove('test_preset.pt') - def test_c51(self): - validate_agent(c51(device=CPU), AtariEnvironment("Breakout", device=CPU)) + def test_a2c(self): + self.validate_preset(a2c) - def test_c51_cuda(self): - validate_agent(c51(device=CUDA), AtariEnvironment("Breakout", device=CUDA)) + def test_c51(self): + self.validate_preset(c51) def test_ddqn(self): - validate_agent( - ddqn(replay_start_size=64, device=CPU), - AtariEnvironment("Breakout", device=CPU), - ) - - def test_ddqn_cuda(self): - validate_agent( - ddqn(replay_start_size=64, device=CUDA), - AtariEnvironment("Breakout", device=CUDA), - ) + self.validate_preset(ddqn) def test_dqn(self): - validate_agent( - dqn(replay_start_size=64, device=CPU), - AtariEnvironment("Breakout", device=CPU), - ) - - def test_dqn_cuda(self): - validate_agent( - dqn(replay_start_size=64, device=CUDA), - AtariEnvironment("Breakout", device=CUDA), - ) + self.validate_preset(dqn) def test_ppo(self): - validate_agent(ppo(device=CPU, n_envs=4), AtariEnvironment("Breakout", device=CPU)) - - def test_ppo_cuda(self): - validate_agent(ppo(device=CUDA, n_envs=4), AtariEnvironment("Breakout", device=CUDA)) + self.validate_preset(ppo) def test_rainbow(self): - validate_agent( - rainbow(replay_start_size=64, device=CPU), - AtariEnvironment("Breakout", device=CPU), - ) - - def test_rainbow_cuda(self): - validate_agent( - rainbow(replay_start_size=64, device=CUDA), - AtariEnvironment("Breakout", device=CUDA), - ) + self.validate_preset(rainbow) def test_vac(self): - validate_agent(vac(device=CPU, n_envs=4), AtariEnvironment("Breakout", device=CPU)) - - def test_vac_cuda(self): - validate_agent( - vac(device=CUDA, n_envs=4), AtariEnvironment("Breakout", device=CUDA) - ) - - def test_vpg(self): - validate_agent(vpg(device=CPU), AtariEnvironment("Breakout", device=CPU)) + self.validate_preset(vac) - def test_vpg_cuda(self): - validate_agent( - vpg(device=CUDA), AtariEnvironment("Breakout", device=CUDA) - ) + def test_vpq(self): + self.validate_preset(vpg) def test_vsarsa(self): - validate_agent(vsarsa(device=CPU, n_envs=4), AtariEnvironment("Breakout", device=CPU)) - - def test_vsarsa_cuda(self): - validate_agent( - vsarsa(device=CUDA, n_envs=4), AtariEnvironment("Breakout", device=CUDA) - ) - + self.validate_preset(vsarsa) def test_vqn(self): - validate_agent(vqn(device=CPU, n_envs=4), AtariEnvironment("Breakout", device=CPU)) - - def test_vqn_cuda(self): - validate_agent( - vqn(device=CUDA, n_envs=4), AtariEnvironment("Breakout", device=CUDA) - ) + self.validate_preset(vqn) + + def validate_preset(self, builder): + preset = builder.device('cpu').env(self.env).build() + # normal agent + agent = preset.agent(writer=DummyWriter(), train_steps=100000) + agent.act(self.env.state) + # test agent + test_agent = preset.test_agent() + test_agent.act(self.env.state) + # test save/load + preset.save('test_preset.pt') + preset = torch.load('test_preset.pt') + test_agent = preset.test_agent() + test_agent.act(self.env.state) if __name__ == "__main__": diff --git 
a/all/presets/builder.py b/all/presets/builder.py new file mode 100644 index 00000000..88eff793 --- /dev/null +++ b/all/presets/builder.py @@ -0,0 +1,88 @@ +from abc import ABC, abstractmethod + + +class PresetBuilder(): + def __init__( + self, + default_name, + default_hyperparameters, + constructor, + device="cuda", + env=None, + hyperparameters=None, + name=None, + ): + self.default_name = default_name + self.default_hyperparameters = default_hyperparameters + self.constructor = constructor + self._device = device + self._env = env + self._hyperparameters = self._merge_hyperparameters(default_hyperparameters, hyperparameters) + self._name = name or default_name + + def device(self, device): + return self._preset_builder(device=device) + + def env(self, env): + return self._preset_builder(env=env) + + def hyperparameters(self, **hyperparameters): + return self._preset_builder(hyperparameters=self._merge_hyperparameters(self._hyperparameters, hyperparameters)) + + def name(self, name): + return self._preset_builder(name=name) + + def build(self): + if not self._env: + raise Exception('Env is required') + + return self.constructor( + self._env, + device=self._device, + name=self._name, + **self._hyperparameters + ) + + def _merge_hyperparameters(self, h1, h2): + if h2 is None: + return h1 + for key in h2.keys(): + if key not in h1: + raise KeyError("Invalid hyperparameter: {}".format(key)) + return {**h1, **h2} + + def _preset_builder(self, **kwargs): + old_kwargs = { + 'device': self._device, + 'env': self._env, + 'hyperparameters': self._hyperparameters, + 'name': self._name, + } + return PresetBuilder(self.default_name, self.default_hyperparameters, self.constructor, **{**old_kwargs, **kwargs}) + + +class ParallelPresetBuilder(PresetBuilder): + def __init__( + self, + default_name, + default_hyperparameters, + constructor, + device="cuda", + env=None, + hyperparameters=None, + name=None, + ): + if 'n_envs' not in default_hyperparameters: + raise Exception('ParallelPreset hyperparameters must include n_envs') + super().__init__( + default_name, + default_hyperparameters, + constructor, + device=device, + env=env, + hyperparameters=hyperparameters, + name=name + ) + + def build(self): + return super().build() diff --git a/all/presets/classic_control/.DS_Store b/all/presets/classic_control/.DS_Store new file mode 100644 index 00000000..56b4cd12 Binary files /dev/null and b/all/presets/classic_control/.DS_Store differ diff --git a/all/presets/classic_control/__init__.py b/all/presets/classic_control/__init__.py index e9aaabc1..8eb72a2c 100644 --- a/all/presets/classic_control/__init__.py +++ b/all/presets/classic_control/__init__.py @@ -1,13 +1,13 @@ -from .a2c import a2c -from .c51 import c51 -from .ddqn import ddqn -from .dqn import dqn -from .ppo import ppo -from .rainbow import rainbow -from .vac import vac -from .vpg import vpg -from .vqn import vqn -from .vsarsa import vsarsa +from .a2c import a2c, A2CClassicControlPreset +from .c51 import c51, C51ClassicControlPreset +from .ddqn import ddqn, DDQNClassicControlPreset +from .dqn import dqn, DQNClassicControlPreset +from .ppo import ppo, PPOClassicControlPreset +from .rainbow import rainbow, RainbowClassicControlPreset +from .vac import vac, VACClassicControlPreset +from .vpg import vpg, VPGClassicControlPreset +from .vqn import vqn, VQNClassicControlPreset +from .vsarsa import vsarsa, VSarsaClassicControlPreset __all__ = [ "a2c", @@ -19,5 +19,5 @@ "vac", "vpg", "vqn", - "vsarsa" + "vsarsa", ] diff --git 
a/all/presets/classic_control/a2c.py b/all/presets/classic_control/a2c.py index 5480459e..b789dcf2 100644 --- a/all/presets/classic_control/a2c.py +++ b/all/presets/classic_control/a2c.py @@ -1,78 +1,103 @@ +import copy from torch.optim import Adam -from all.agents import A2C +from all.agents import A2C, A2CTestAgent from all.approximation import VNetwork, FeatureNetwork from all.logging import DummyWriter from all.policies import SoftmaxPolicy -from .models import fc_relu_features, fc_policy_head, fc_value_head +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.classic_control.models import fc_relu_features, fc_policy_head, fc_value_head -def a2c( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=3e-3, - # Other optimization settings - clip_grad=0.1, - entropy_loss_scaling=0.001, - # Batch settings - n_envs=4, - n_steps=32, - # Model construction - feature_model_constructor=fc_relu_features, - value_model_constructor=fc_value_head, - policy_model_constructor=fc_policy_head -): +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 3e-3, + # Other optimization settings + "clip_grad": 0.1, + "entropy_loss_scaling": 0.001, + "value_loss_scaling": 0.5, + # Batch settings + "n_envs": 4, + "n_steps": 32, + # Model construction + "feature_model_constructor": fc_relu_features, + "value_model_constructor": fc_value_head, + "policy_model_constructor": fc_policy_head +} + + +class A2CClassicControlPreset(ParallelPreset): """ - A2C classic control preset. + Advantage Actor-Critic (A2C) classic control preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. - clip_grad (float): The maximum magnitude of the gradient for any given parameter. Set to 0 to disable. + eps (float): Stability parameters for the Adam optimizer. + clip_grad (float): The maximum magnitude of the gradient for any given parameter. + Set to 0 to disable. entropy_loss_scaling (float): Coefficient for the entropy term in the total loss. + value_loss_scaling (float): Coefficient for the value function loss. n_envs (int): Number of parallel environments. n_steps (int): Length of each rollout. feature_model_constructor (function): The function used to construct the neural feature model. value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. 
""" - def _a2c(envs, writer=DummyWriter()): - env = envs[0] - feature_model = feature_model_constructor(env).to(device) - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(env).to(device) - feature_optimizer = Adam(feature_model.parameters(), lr=lr) - value_optimizer = Adam(value_model.parameters(), lr=lr) - policy_optimizer = Adam(policy_model.parameters(), lr=lr) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor'](env).to(device) + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"]) features = FeatureNetwork( - feature_model, feature_optimizer, clip_grad=clip_grad) + self.feature_model, + feature_optimizer, + clip_grad=self.hyperparameters["clip_grad"] + ) + v = VNetwork( - value_model, + self.value_model, value_optimizer, - clip_grad=clip_grad, + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + policy = SoftmaxPolicy( - policy_model, + self.policy_model, policy_optimizer, - clip_grad=clip_grad, + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + return A2C( features, v, policy, - n_envs=n_envs, - n_steps=n_steps, - discount_factor=discount_factor, - entropy_loss_scaling=entropy_loss_scaling, + n_envs=self.hyperparameters["n_envs"], + n_steps=self.hyperparameters["n_steps"], + discount_factor=self.hyperparameters["discount_factor"], + entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"], writer=writer ) - return _a2c, n_envs + + def test_agent(self): + features = FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return A2CTestAgent(features, policy) -__all__ = ["a2c"] +a2c = ParallelPresetBuilder('a2c', default_hyperparameters, A2CClassicControlPreset) diff --git a/all/presets/classic_control/c51.py b/all/presets/classic_control/c51.py index 793bd140..c800e92c 100644 --- a/all/presets/classic_control/c51.py +++ b/all/presets/classic_control/c51.py @@ -1,88 +1,121 @@ +import copy from torch.optim import Adam -from all.agents import C51 -from all.approximation import QDist +from all.agents import C51, C51TestAgent +from all.approximation import QDist, FixedTarget from all.logging import DummyWriter from all.memory import ExperienceReplayBuffer from all.optim import LinearScheduler -from .models import fc_relu_dist_q +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.classic_control.models import fc_relu_dist_q -def c51( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-4, - # Training settings - minibatch_size=128, - update_frequency=1, - # Replay buffer settings - replay_start_size=1000, - replay_buffer_size=20000, - # Exploration settings - initial_exploration=1.00, - final_exploration=0.02, - final_exploration_frame=10000, - # Distributional RL - atoms=101, - v_min=-100, - v_max=100, - # Model construction - model_constructor=fc_relu_dist_q -): +default_hyperparameters = { + 
"discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-4, + # Training settings + "minibatch_size": 128, + "update_frequency": 1, + "target_update_frequency": 100, + # Replay buffer settings + "replay_start_size": 1000, + "replay_buffer_size": 20000, + # Exploration settings + "initial_exploration": 1.00, + "final_exploration": 0.02, + "final_exploration_step": 10000, + "test_exploration": 0.001, + # Distributional RL + "atoms": 101, + "v_min": -100, + "v_max": 100, + # Model construction + "model_constructor": fc_relu_dist_q +} + + +class C51ClassicControlPreset(Preset): """ - C51 classic control preset. + Categorical DQN (C51) Atari preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. update_frequency (int): Number of timesteps per training update. + target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. - initial_exploration (int): Initial probability of choosing a random action, + initial_exploration (float): Initial probability of choosing a random action, decayed over course of training. - final_exploration (int): Final probability of choosing a random action. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent atoms (int): The number of atoms in the categorical distribution used to represent the distributional value function. v_min (int): The expected return corresponding to the smallest atom. v_max (int): The expected return correspodning to the larget atom. model_constructor (function): The function used to construct the neural model. 
""" - def _c51(env, writer=DummyWriter()): - model = model_constructor(env, atoms=atoms).to(device) - optimizer = Adam(model.parameters(), lr=lr) + + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env, atoms=hyperparameters['atoms']).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr']) + q = QDist( - model, + self.model, optimizer, - env.action_space.n, - atoms, - v_min=v_min, - v_max=v_max, + self.n_actions, + self.hyperparameters['atoms'], + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], + target=FixedTarget(self.hyperparameters['target_update_frequency']), writer=writer, ) - replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device) + + replay_buffer = ExperienceReplayBuffer( + self.hyperparameters['replay_buffer_size'], + device=self.device + ) + return C51( q, replay_buffer, exploration=LinearScheduler( - initial_exploration, - final_exploration, - replay_start_size, - final_exploration_frame, + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + 0, + self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"], name="epsilon", writer=writer, ), - discount_factor=discount_factor, - minibatch_size=minibatch_size, - replay_start_size=replay_start_size, - update_frequency=update_frequency, + discount_factor=self.hyperparameters["discount_factor"], + minibatch_size=self.hyperparameters["minibatch_size"], + replay_start_size=self.hyperparameters["replay_start_size"], + update_frequency=self.hyperparameters["update_frequency"], writer=writer ) - return _c51 + def test_agent(self): + q_dist = QDist( + copy.deepcopy(self.model), + None, + self.n_actions, + self.hyperparameters['atoms'], + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], + ) + return C51TestAgent(q_dist, self.n_actions, self.hyperparameters["test_exploration"]) -__all__ = ["c51"] +c51 = PresetBuilder('c51', default_hyperparameters, C51ClassicControlPreset) diff --git a/all/presets/classic_control/ddqn.py b/all/presets/classic_control/ddqn.py index 75cc76d6..4e1c6acb 100644 --- a/all/presets/classic_control/ddqn.py +++ b/all/presets/classic_control/ddqn.py @@ -1,92 +1,117 @@ +import copy from torch.optim import Adam -from all.agents import DDQN +from all.agents import DDQN, DDQNTestAgent from all.approximation import QNetwork, FixedTarget from all.logging import DummyWriter from all.memory import PrioritizedReplayBuffer from all.optim import LinearScheduler from all.policies import GreedyPolicy -from .models import dueling_fc_relu_q +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.classic_control.models import dueling_fc_relu_q -def ddqn( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-3, - # Training settings - minibatch_size=64, - update_frequency=1, - target_update_frequency=100, - # Replay buffer settings - replay_start_size=1000, - replay_buffer_size=10000, - # Exploration settings - initial_exploration=1., - final_exploration=0., - final_exploration_frame=10000, - # Prioritized replay settings - alpha=0.2, - beta=0.6, - # Model construction - model_constructor=dueling_fc_relu_q -): +default_hyperparameters = { + "discount_factor": 
0.99, + # Adam optimizer settings + "lr": 1e-3, + # Training settings + "minibatch_size": 64, + "update_frequency": 1, + "target_update_frequency": 100, + # Replay buffer settings + "replay_start_size": 1000, + "replay_buffer_size": 10000, + # Exploration settings + "initial_exploration": 1., + "final_exploration": 0., + "final_exploration_step": 10000, + "test_exploration": 0.001, + # Prioritized replay settings + "alpha": 0.2, + "beta": 0.6, + # Model construction + "model_constructor": dueling_fc_relu_q +} + + +class DDQNClassicControlPreset(Preset): """ - Dueling Double DQN with Prioritized Experience Replay (PER). + Dueling Double DQN (DDQN) with Prioritized Experience Replay (PER) Classic Control Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. update_frequency (int): Number of timesteps per training update. target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. - initial_exploration (int): Initial probability of choosing a random action, - decayed until final_exploration_frame. - final_exploration (int): Final probability of choosing a random action. - final_exploration_frame (int): The frame where the exploration decay stops. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent alpha (float): Amount of prioritization in the prioritized experience replay buffer. (0 = no prioritization, 1 = full prioritization) beta (float): The strength of the importance sampling correction for prioritized experience replay. (0 = no correction, 1 = full correction) model_constructor (function): The function used to construct the neural model. 
""" - def _ddqn(env, writer=DummyWriter()): - model = model_constructor(env).to(device) - optimizer = Adam(model.parameters(), lr=lr) + + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr']) + q = QNetwork( - model, + self.model, optimizer, - target=FixedTarget(target_update_frequency), + target=FixedTarget(self.hyperparameters['target_update_frequency']), writer=writer ) + policy = GreedyPolicy( q, - env.action_space.n, + self.n_actions, epsilon=LinearScheduler( - initial_exploration, - final_exploration, - replay_start_size, - final_exploration_frame, - name="epsilon", + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + self.hyperparameters['replay_start_size'], + self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'], + name="exploration", writer=writer ) ) + replay_buffer = PrioritizedReplayBuffer( - replay_buffer_size, - alpha=alpha, - beta=beta, - device=device + self.hyperparameters['replay_buffer_size'], + alpha=self.hyperparameters['alpha'], + beta=self.hyperparameters['beta'], + device=self.device + ) + + return DDQN( + q, + policy, + replay_buffer, + discount_factor=self.hyperparameters["discount_factor"], + minibatch_size=self.hyperparameters["minibatch_size"], + replay_start_size=self.hyperparameters["replay_start_size"], + update_frequency=self.hyperparameters["update_frequency"], ) - return DDQN(q, policy, replay_buffer, - discount_factor=discount_factor, - replay_start_size=replay_start_size, - update_frequency=update_frequency, - minibatch_size=minibatch_size) - return _ddqn + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return DDQNTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) -__all__ = ["ddqn"] +ddqn = PresetBuilder('ddqn', default_hyperparameters, DDQNClassicControlPreset) diff --git a/all/presets/classic_control/dqn.py b/all/presets/classic_control/dqn.py index 0f6c5b97..354c9576 100644 --- a/all/presets/classic_control/dqn.py +++ b/all/presets/classic_control/dqn.py @@ -1,84 +1,109 @@ +import copy from torch.optim import Adam -from all.agents import DQN +from all.agents import DQN, DQNTestAgent from all.approximation import QNetwork, FixedTarget from all.logging import DummyWriter from all.memory import ExperienceReplayBuffer from all.optim import LinearScheduler from all.policies import GreedyPolicy -from .models import fc_relu_q +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.classic_control.models import fc_relu_q -def dqn( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-3, - # Training settings - minibatch_size=64, - update_frequency=1, - target_update_frequency=100, - # Replay buffer settings - replay_start_size=1000, - replay_buffer_size=10000, - # Exploration settings - initial_exploration=1., - final_exploration=0., - final_exploration_frame=10000, - # Model construction - model_constructor=fc_relu_q -): +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-3, + # Training settings + "minibatch_size": 64, + "update_frequency": 1, + "target_update_frequency": 100, 
+ # Replay buffer settings + "replay_start_size": 1000, + "replay_buffer_size": 10000, + # Explicit exploration + "initial_exploration": 1., + "final_exploration": 0., + "final_exploration_step": 10000, + "test_exploration": 0.001, + # Model construction + "model_constructor": fc_relu_q +} + + +class DQNClassicControlPreset(Preset): """ - DQN classic control preset. + Deep Q-Network (DQN) Classic Control Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. - discount_factor (float): Discount factor for future rewards. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: + discount_factor (float, optional): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. update_frequency (int): Number of timesteps per training update. target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. - initial_exploration (int): Initial probability of choosing a random action, - decayed until final_exploration_frame. - final_exploration (int): Final probability of choosing a random action. - final_exploration_frame (int): The frame where the exploration decay stops. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent model_constructor (function): The function used to construct the neural model. 
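A minimal usage sketch (not part of this diff) showing how this preset is built through the PresetBuilder interface exercised later in classic_control_test.py; the environment, device, and step count are only illustrative:

from all.environments import GymEnvironment
from all.logging import DummyWriter
from all.presets.classic_control import dqn

env = GymEnvironment('CartPole-v0')
env.reset()
preset = dqn.device('cpu').env(env).build()              # DQNClassicControlPreset
agent = preset.agent(writer=DummyWriter(), train_steps=100000)
agent.act(env.state)                                     # training-mode agent
test_agent = preset.test_agent()                         # greedy agent using test_exploration
test_agent.act(env.state)
# with the defaults above, epsilon anneals from 1.0 to 0.0 over
# final_exploration_step - replay_start_size = 10000 - 1000 = 9000 timesteps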
""" - def _dqn(env, writer=DummyWriter()): - model = model_constructor(env).to(device) - optimizer = Adam(model.parameters(), lr=lr) + + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr']) + q = QNetwork( - model, + self.model, optimizer, - target=FixedTarget(target_update_frequency), + target=FixedTarget(self.hyperparameters['target_update_frequency']), writer=writer ) + policy = GreedyPolicy( q, - env.action_space.n, + self.n_actions, epsilon=LinearScheduler( - initial_exploration, - final_exploration, - replay_start_size, - final_exploration_frame, - name="epsilon", + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + self.hyperparameters['replay_start_size'], + self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'], + name="exploration", writer=writer ) ) + replay_buffer = ExperienceReplayBuffer( - replay_buffer_size, device=device) + self.hyperparameters['replay_buffer_size'], + device=self.device + ) + return DQN( q, policy, replay_buffer, - discount_factor=discount_factor, - minibatch_size=minibatch_size, - replay_start_size=replay_start_size, - update_frequency=update_frequency, + discount_factor=self.hyperparameters['discount_factor'], + minibatch_size=self.hyperparameters['minibatch_size'], + replay_start_size=self.hyperparameters['replay_start_size'], + update_frequency=self.hyperparameters['update_frequency'], ) - return _dqn + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return DQNTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) -__all__ = ["dqn"] +dqn = PresetBuilder('dqn', default_hyperparameters, DQNClassicControlPreset) diff --git a/all/presets/classic_control/ppo.py b/all/presets/classic_control/ppo.py index 72fce453..b46c7cca 100644 --- a/all/presets/classic_control/ppo.py +++ b/all/presets/classic_control/ppo.py @@ -1,44 +1,63 @@ +import copy from torch.optim import Adam -from all.agents import PPO +from torch.optim.lr_scheduler import CosineAnnealingLR +from all.agents import PPO, PPOTestAgent +from all.bodies import DeepmindAtariBody from all.approximation import VNetwork, FeatureNetwork from all.logging import DummyWriter +from all.optim import LinearScheduler from all.policies import SoftmaxPolicy -from .models import fc_relu_features, fc_policy_head, fc_value_head - -def ppo( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-3, - # Other optimization settings - clip_grad=0.1, - entropy_loss_scaling=0.001, - epsilon=0.2, - # Batch settings - epochs=4, - minibatches=4, - n_envs=8, - n_steps=8, - # GAE settings - lam=0.95, - # Model construction - feature_model_constructor=fc_relu_features, - value_model_constructor=fc_value_head, - policy_model_constructor=fc_policy_head -): +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.classic_control.models import fc_relu_features, fc_policy_head, fc_value_head + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-3, + "eps": 1.5e-4, + # Other optimization settings + "clip_grad": 0.1, + "entropy_loss_scaling": 0.001, + 
"value_loss_scaling": 0.5, + "clip_initial": 0.1, + "clip_final": 0.01, + # Batch settings + "epochs": 4, + "minibatches": 4, + "n_envs": 8, + "n_steps": 8, + # GAE settings + "lam": 0.95, + # Model construction + "feature_model_constructor": fc_relu_features, + "value_model_constructor": fc_value_head, + "policy_model_constructor": fc_policy_head +} + + +class PPOClassicControlPreset(ParallelPreset): """ - PPO classic control preset. + Proximal Policy Optimization (PPO) Classic Control preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. + eps (float): Stability parameters for the Adam optimizer. clip_grad (float): The maximum magnitude of the gradient for any given parameter. Set to 0 to disable. entropy_loss_scaling (float): Coefficient for the entropy term in the total loss. - epsilon (float): Value for epsilon in the clipped PPO objective function. - epochs (int): Number of times to iterature through each batch. + value_loss_scaling (float): Coefficient for the value function loss. + clip_initial (float): Value for epsilon in the clipped PPO objective function at the beginning of training. + clip_final (float): Value for epsilon in the clipped PPO objective function at the end of training. + epochs (int): Number of times to literature through each batch. minibatches (int): The number of minibatches to split each batch into. n_envs (int): Number of parallel actors. n_steps (int): Length of each rollout. @@ -47,44 +66,68 @@ def ppo( value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. 
""" - def _ppo(envs, writer=DummyWriter()): - env = envs[0] - feature_model = feature_model_constructor(env).to(device) - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(env).to(device) - feature_optimizer = Adam(feature_model.parameters(), lr=lr) - value_optimizer = Adam(value_model.parameters(), lr=lr) - policy_optimizer = Adam(policy_model.parameters(), lr=lr) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor'](env).to(device) + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs']) + + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) features = FeatureNetwork( - feature_model, feature_optimizer, clip_grad=clip_grad) + self.feature_model, + feature_optimizer, + clip_grad=self.hyperparameters["clip_grad"], + writer=writer + ) + v = VNetwork( - value_model, + self.value_model, value_optimizer, - clip_grad=clip_grad, + loss_scaling=self.hyperparameters["value_loss_scaling"], + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + policy = SoftmaxPolicy( - policy_model, + self.policy_model, policy_optimizer, - clip_grad=clip_grad, + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + return PPO( features, v, policy, - epsilon=epsilon, - epochs=epochs, - lam=lam, - minibatches=minibatches, - n_envs=n_envs, - n_steps=n_steps, - discount_factor=discount_factor, - entropy_loss_scaling=entropy_loss_scaling, - writer=writer + epsilon=LinearScheduler( + self.hyperparameters["clip_initial"], + self.hyperparameters["clip_final"], + 0, + n_updates, + name='clip', + writer=writer + ), + epochs=self.hyperparameters["epochs"], + minibatches=self.hyperparameters["minibatches"], + n_envs=self.hyperparameters["n_envs"], + n_steps=self.hyperparameters["n_steps"], + discount_factor=self.hyperparameters["discount_factor"], + lam=self.hyperparameters["lam"], + entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"], + writer=writer, ) - return _ppo, n_envs -__all__ = ["ppo"] + def test_agent(self): + features = FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return PPOTestAgent(features, policy) + + +ppo = ParallelPresetBuilder('ppo', default_hyperparameters, PPOClassicControlPreset) diff --git a/all/presets/classic_control/rainbow.py b/all/presets/classic_control/rainbow.py index 199b1a42..3610849b 100644 --- a/all/presets/classic_control/rainbow.py +++ b/all/presets/classic_control/rainbow.py @@ -1,51 +1,69 @@ +import copy from torch.optim import Adam -from all.agents import Rainbow -from all.approximation import QDist +from torch.optim.lr_scheduler import CosineAnnealingLR +from all.approximation import QDist, FixedTarget +from all.agents import Rainbow, RainbowTestAgent +from all.bodies import 
DeepmindAtariBody from all.logging import DummyWriter -from all.memory import ( - PrioritizedReplayBuffer, - NStepReplayBuffer, -) -from .models import fc_relu_rainbow +from all.memory import PrioritizedReplayBuffer, NStepReplayBuffer +from all.optim import LinearScheduler +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.classic_control.models import fc_relu_rainbow -def rainbow( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=2e-4, - # Training settings - minibatch_size=64, - update_frequency=1, - # Replay buffer settings - replay_buffer_size=20000, - replay_start_size=1000, - # Prioritized replay settings - alpha=0.5, - beta=0.5, - # Multi-step learning - n_steps=5, - # Distributional RL - atoms=101, - v_min=-100, - v_max=100, - # Noisy Nets - sigma=0.5, - # Model construction - model_constructor=fc_relu_rainbow -): +default_hyperparameters = { + "discount_factor": 0.99, + "lr": 2e-4, + "eps": 1.5e-4, + # Training settings + "minibatch_size": 64, + "update_frequency": 1, + "target_update_frequency": 100, + # Replay buffer settings + "replay_start_size": 1000, + "replay_buffer_size": 20000, + # Explicit exploration + "initial_exploration": 0.02, + "final_exploration": 0., + "test_exploration": 0.001, + # Prioritized replay settings + "alpha": 0.5, + "beta": 0.5, + # Multi-step learning + "n_steps": 5, + # Distributional RL + "atoms": 101, + "v_min": -101, + "v_max": 101, + # Noisy Nets + "sigma": 0.5, + # Model construction + "model_constructor": fc_relu_rainbow +} + + +class RainbowClassicControlPreset(Preset): """ - Rainbow classic control preset. + Rainbow DQN Classic Control Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. + eps (float): Stability parameters for the Adam optimizer. minibatch_size (int): Number of experiences to sample in each training update. update_frequency (int): Number of timesteps per training update. + target_update_frequency (int): Number of timesteps between updates the target network. replay_start_size (int): Number of experiences in replay buffer when training begins. replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. alpha (float): Amount of prioritization in the prioritized experience replay buffer. (0 = no prioritization, 1 = full prioritization) beta (float): The strength of the importance sampling correction for prioritized experience replay. @@ -54,42 +72,73 @@ def rainbow( atoms (int): The number of atoms in the categorical distribution used to represent the distributional value function. v_min (int): The expected return corresponding to the smallest atom. - v_max (int): The expected return correspodning to the larget atom. + v_max (int): The expected return corresponding to the largest atom. sigma (float): Initial noisy network noise. model_constructor (function): The function used to construct the neural model. 
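The agent() method below composes the replay buffer from two pieces; a standalone sketch using the default hyperparameters (the device string is illustrative):

from all.memory import NStepReplayBuffer, PrioritizedReplayBuffer

# PrioritizedReplayBuffer provides prioritized sampling (alpha, beta), and
# NStepReplayBuffer wraps it so that stored transitions carry 5-step returns.
replay_buffer = NStepReplayBuffer(
    5,       # n_steps
    0.99,    # discount_factor
    PrioritizedReplayBuffer(20000, alpha=0.5, beta=0.5, device='cpu')
)
# since transitions already span n_steps, the agent receives an effective
# discount of discount_factor ** n_steps = 0.99 ** 5 (about 0.951)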
""" - def _rainbow(env, writer=DummyWriter()): - model = model_constructor(env, atoms=atoms, sigma=sigma).to(device) - optimizer = Adam(model.parameters(), lr=lr) - q = QDist( - model, + + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env, atoms=hyperparameters["atoms"], sigma=hyperparameters["sigma"]).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + optimizer = Adam( + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] + ) + + q_dist = QDist( + self.model, optimizer, - env.action_space.n, - atoms, - v_min=v_min, - v_max=v_max, + self.n_actions, + self.hyperparameters['atoms'], + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], + target=FixedTarget(self.hyperparameters['target_update_frequency']), writer=writer, ) - # replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device) - replay_buffer = PrioritizedReplayBuffer( - replay_buffer_size, - alpha=alpha, - beta=beta, - device=device + + replay_buffer = NStepReplayBuffer( + self.hyperparameters['n_steps'], + self.hyperparameters['discount_factor'], + PrioritizedReplayBuffer( + self.hyperparameters['replay_buffer_size'], + alpha=self.hyperparameters['alpha'], + beta=self.hyperparameters['beta'], + device=self.device + ) ) - replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer) + return Rainbow( - q, + q_dist, replay_buffer, - exploration=0., - discount_factor=discount_factor ** n_steps, - minibatch_size=minibatch_size, - replay_start_size=replay_start_size, - update_frequency=update_frequency, + exploration=LinearScheduler( + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + 0, + train_steps - self.hyperparameters['replay_start_size'], + name="exploration", + writer=writer + ), + discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"], + minibatch_size=self.hyperparameters['minibatch_size'], + replay_start_size=self.hyperparameters['replay_start_size'], + update_frequency=self.hyperparameters['update_frequency'], writer=writer, ) - return _rainbow + def test_agent(self): + q_dist = QDist( + copy.deepcopy(self.model), + None, + self.n_actions, + self.hyperparameters['atoms'], + v_min=self.hyperparameters['v_min'], + v_max=self.hyperparameters['v_max'], + ) + return RainbowTestAgent(q_dist, self.n_actions, self.hyperparameters["test_exploration"]) -__all__ = ["rainbow"] +rainbow = PresetBuilder('rainbow', default_hyperparameters, RainbowClassicControlPreset) diff --git a/all/presets/classic_control/vac.py b/all/presets/classic_control/vac.py index e56def30..5d8e5892 100644 --- a/all/presets/classic_control/vac.py +++ b/all/presets/classic_control/vac.py @@ -1,49 +1,97 @@ +import copy from torch.optim import Adam -from all.agents import VAC +from torch.optim.lr_scheduler import CosineAnnealingLR +from all.agents import VAC, VACTestAgent from all.approximation import VNetwork, FeatureNetwork +from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.policies import SoftmaxPolicy -from .models import fc_relu_features, fc_policy_head, fc_value_head - - -def vac( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr_v=5e-3, - lr_pi=1e-3, - eps=1e-5, - # Model construction - feature_model_constructor=fc_relu_features, - 
value_model_constructor=fc_value_head, - policy_model_constructor=fc_policy_head -): +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.classic_control.models import fc_relu_features, fc_policy_head, fc_value_head + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr_v": 5e-4, + "lr_pi": 1e-4, + "eps": 1.5e-4, + # Other optimization settings + "clip_grad": 0.5, + "value_loss_scaling": 0.25, + # Parallel actors + "n_envs": 16, + # Model construction + "feature_model_constructor": fc_relu_features, + "value_model_constructor": fc_value_head, + "policy_model_constructor": fc_policy_head +} + + +class VACClassicControlPreset(ParallelPreset): """ - Vanilla Actor-Critic classic control preset. + Vanilla Actor-Critic (VAC) Classic Control preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr_v (float): Learning rate for value network. lr_pi (float): Learning rate for policy network and feature network. eps (float): Stability parameters for the Adam optimizer. + clip_grad (float): The maximum magnitude of the gradient for any given parameter. + Set to 0 to disable. + value_loss_scaling (float): Coefficient for the value function loss. + n_envs (int): Number of parallel environments. feature_model_constructor (function): The function used to construct the neural feature model. value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. 
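A short sketch of the save/load round trip that classic_control_test.py exercises for every preset, including this one (the file name and device are arbitrary):

import torch
from all.environments import GymEnvironment
from all.presets.classic_control import vac

env = GymEnvironment('CartPole-v0')
env.reset()
preset = vac.device('cpu').env(env).build()
preset.save('vac_cartpole.pt')           # serializes the preset and its models
restored = torch.load('vac_cartpole.pt')
test_agent = restored.test_agent()       # test_agent() deep-copies the models
test_agent.act(env.state)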
""" - def _vac(env, writer=DummyWriter()): - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(env).to(device) - feature_model = feature_model_constructor(env).to(device) - value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps) - policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps) - feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor'](env).to(device) + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) + + features = FeatureNetwork( + self.feature_model, + feature_optimizer, + clip_grad=self.hyperparameters["clip_grad"], + writer=writer + ) + + v = VNetwork( + self.value_model, + value_optimizer, + loss_scaling=self.hyperparameters["value_loss_scaling"], + clip_grad=self.hyperparameters["clip_grad"], + writer=writer + ) + + policy = SoftmaxPolicy( + self.policy_model, + policy_optimizer, + clip_grad=self.hyperparameters["clip_grad"], + writer=writer + ) + + return VAC(features, v, policy, discount_factor=self.hyperparameters["discount_factor"]) + + def test_agent(self): + features = FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return VACTestAgent(features, policy) - v = VNetwork(value_model, value_optimizer, writer=writer) - policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer) - features = FeatureNetwork(feature_model, feature_optimizer) - return VAC(features, v, policy, discount_factor=discount_factor) - return _vac +vac = ParallelPresetBuilder('vac', default_hyperparameters, VACClassicControlPreset) diff --git a/all/presets/classic_control/vpg.py b/all/presets/classic_control/vpg.py index 05a050ce..cc0cbe2a 100644 --- a/all/presets/classic_control/vpg.py +++ b/all/presets/classic_control/vpg.py @@ -1,64 +1,96 @@ +import copy from torch.optim import Adam -from all.agents import VPG +from torch.optim.lr_scheduler import CosineAnnealingLR +from all.agents import VPG, VPGTestAgent from all.approximation import VNetwork, FeatureNetwork +from all.bodies import DeepmindAtariBody from all.logging import DummyWriter from all.policies import SoftmaxPolicy -from .models import fc_relu_features, fc_policy_head, fc_value_head +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.classic_control.models import fc_relu_features, fc_policy_head, fc_value_head -def vpg( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=5e-3, - # Batch settings - min_batch_size=500, - # Model construction - feature_model_constructor=fc_relu_features, - value_model_constructor=fc_value_head, - policy_model_constructor=fc_policy_head -): +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr_v": 
5e-3, + "lr_pi": 1e-4, + "eps": 1.5e-4, + # Other optimization settings + "clip_grad": 0.5, + "value_loss_scaling": 0.25, + "min_batch_size": 500, + # Model construction + "feature_model_constructor": fc_relu_features, + "value_model_constructor": fc_value_head, + "policy_model_constructor": fc_policy_head +} + + +class VPGClassicControlPreset(Preset): """ - Vanilla Policy Gradient classic control preset. + Vanilla Policy Gradient (VPG) Classic Control preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. + eps (float): Stability parameters for the Adam optimizer. + clip_grad (float): The maximum magnitude of the gradient for any given parameter. + Set to 0 to disable. + value_loss_scaling (float): Coefficient for the value function loss. min_batch_size (int): Continue running complete episodes until at least this many states have been seen since the last update. feature_model_constructor (function): The function used to construct the neural feature model. value_model_constructor (function): The function used to construct the neural value model. policy_model_constructor (function): The function used to construct the neural policy model. """ - def _vpg(env, writer=DummyWriter()): - feature_model = feature_model_constructor(env).to(device) - value_model = value_model_constructor().to(device) - policy_model = policy_model_constructor(env).to(device) - feature_optimizer = Adam(feature_model.parameters(), lr=lr) - value_optimizer = Adam(value_model.parameters(), lr=lr) - policy_optimizer = Adam(policy_model.parameters(), lr=lr) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters['value_model_constructor']().to(device) + self.policy_model = hyperparameters['policy_model_constructor'](env).to(device) + self.feature_model = hyperparameters['feature_model_constructor'](env).to(device) + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"]) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) features = FeatureNetwork( - feature_model, + self.feature_model, feature_optimizer, + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + v = VNetwork( - value_model, + self.value_model, value_optimizer, + loss_scaling=self.hyperparameters["value_loss_scaling"], + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) + policy = SoftmaxPolicy( - policy_model, + self.policy_model, policy_optimizer, + clip_grad=self.hyperparameters["clip_grad"], writer=writer ) - return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size) - return _vpg + + return VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]) + + def test_agent(self): + features = 
FeatureNetwork(copy.deepcopy(self.feature_model)) + policy = SoftmaxPolicy(copy.deepcopy(self.policy_model)) + return VPGTestAgent(features, policy) -__all__ = ["vpg"] +vpg = PresetBuilder('vpg', default_hyperparameters, VPGClassicControlPreset) diff --git a/all/presets/classic_control/vqn.py b/all/presets/classic_control/vqn.py index bb95d291..609ce356 100644 --- a/all/presets/classic_control/vqn.py +++ b/all/presets/classic_control/vqn.py @@ -1,42 +1,96 @@ +import copy from torch.optim import Adam -from all.agents import VQN +from torch.optim.lr_scheduler import CosineAnnealingLR from all.approximation import QNetwork -from all.policies import ParallelGreedyPolicy +from all.agents import VQN, VQNTestAgent +from all.bodies import DeepmindAtariBody from all.logging import DummyWriter -from .models import fc_relu_q - -def vqn( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-2, - eps=1e-5, - # Exploration settings - epsilon=0.1, - # Parallel actors - n_envs=8, - # Model construction - model_constructor=fc_relu_q -): +from all.optim import LinearScheduler +from all.policies import ParallelGreedyPolicy +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.classic_control.models import dueling_fc_relu_q + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-2, + "eps": 1.5e-4, + # Explicit exploration + "initial_exploration": 1., + "final_exploration": 0., + "final_exploration_step": 10000, + "test_exploration": 0.001, + # Parallel actors + "n_envs": 8, + # Model construction + "model_constructor": dueling_fc_relu_q +} + + +class VQNClassicControlPreset(ParallelPreset): """ - Vanilla Q-Network classic control preset. + Vanilla Q-Network (VQN) Classic Control Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. - epsilon (int): Probability of choosing a random action. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent n_envs (int): Number of parallel environments. model_constructor (function): The function used to construct the neural model. 
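The exploration schedule built in agent() below is measured in parallel updates rather than raw environment steps; a small worked sketch using the defaults above:

# each update consumes one step from each of the n_envs workers, so the decay
# duration is divided by n_envs:
final_exploration_step = 10000
n_envs = 8
decay_updates = final_exploration_step / n_envs   # 1250.0
# LinearScheduler(1.0, 0.0, 0, 1250.0, name="exploration") then anneals epsilon
# from initial_exploration down to final_exploration over those updates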
""" - def _vqn(envs, writer=DummyWriter()): - env = envs[0] - model = model_constructor(env).to(device) - optimizer = Adam(model.parameters(), lr=lr, eps=eps) - q = QNetwork(model, optimizer, writer=writer) - policy = ParallelGreedyPolicy(q, env.action_space.n, epsilon=epsilon) - return VQN(q, policy, discount_factor=discount_factor) - return _vqn, n_envs - \ No newline at end of file + + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps / self.hyperparameters['n_envs'] + + optimizer = Adam( + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] + ) + + q = QNetwork( + self.model, + optimizer, + scheduler=CosineAnnealingLR(optimizer, n_updates), + writer=writer + ) + + policy = ParallelGreedyPolicy( + q, + self.n_actions, + epsilon=LinearScheduler( + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + 0, + self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"], + name="exploration", + writer=writer + ) + ) + + return VQN(q, policy, discount_factor=self.hyperparameters['discount_factor']) + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return VQNTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) + + +vqn = ParallelPresetBuilder('vqn', default_hyperparameters, VQNClassicControlPreset) diff --git a/all/presets/classic_control/vsarsa.py b/all/presets/classic_control/vsarsa.py index 73e52d5b..e26b16e5 100644 --- a/all/presets/classic_control/vsarsa.py +++ b/all/presets/classic_control/vsarsa.py @@ -1,42 +1,96 @@ +import copy from torch.optim import Adam -from all.agents import VSarsa +from torch.optim.lr_scheduler import CosineAnnealingLR from all.approximation import QNetwork -from all.policies import ParallelGreedyPolicy +from all.agents import VSarsa, VSarsaTestAgent +from all.bodies import DeepmindAtariBody from all.logging import DummyWriter -from .models import fc_relu_q - -def vsarsa( - # Common settings - device="cpu", - discount_factor=0.99, - # Adam optimizer settings - lr=1e-2, - eps=1e-5, - # Exploration settings - epsilon=0.1, - # Parallel actors - n_envs=8, - # Model construction - model_constructor=fc_relu_q -): +from all.optim import LinearScheduler +from all.policies import ParallelGreedyPolicy +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.classic_control.models import dueling_fc_relu_q + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.99, + # Adam optimizer settings + "lr": 1e-2, + "eps": 1.5e-4, + # Explicit exploration + "initial_exploration": 1., + "final_exploration": 0., + "final_exploration_step": 10000, + "test_exploration": 0.001, + # Parallel actors + "n_envs": 8, + # Model construction + "model_constructor": dueling_fc_relu_q +} + + +class VSarsaClassicControlPreset(ParallelPreset): """ - Vanilla SARSA classic control preset. + Vanilla SARSA (VSarsa) Classic Control Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. 
+ device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. - epsilon (int): Probability of choosing a random action. + initial_exploration (float): Initial probability of choosing a random action, + decayed over course of training. + final_exploration (float): Final probability of choosing a random action. + final_exploration_step (int): The step at which exploration decay is finished + test_exploration (float): The exploration rate of the test Agent n_envs (int): Number of parallel environments. model_constructor (function): The function used to construct the neural model. """ - def _vsarsa(envs, writer=DummyWriter()): - env = envs[0] - model = model_constructor(env).to(device) - optimizer = Adam(model.parameters(), lr=lr, eps=eps) - q = QNetwork(model, optimizer, writer=writer) - policy = ParallelGreedyPolicy(q, env.action_space.n, epsilon=epsilon) - return VSarsa(q, policy, discount_factor=discount_factor) - return _vsarsa, n_envs - \ No newline at end of file + + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.model = hyperparameters['model_constructor'](env).to(device) + self.n_actions = env.action_space.n + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps / self.hyperparameters['n_envs'] + + optimizer = Adam( + self.model.parameters(), + lr=self.hyperparameters['lr'], + eps=self.hyperparameters['eps'] + ) + + q = QNetwork( + self.model, + optimizer, + scheduler=CosineAnnealingLR(optimizer, n_updates), + writer=writer + ) + + policy = ParallelGreedyPolicy( + q, + self.n_actions, + epsilon=LinearScheduler( + self.hyperparameters['initial_exploration'], + self.hyperparameters['final_exploration'], + 0, + self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"], + name="exploration", + writer=writer + ) + ) + + return VSarsa(q, policy, discount_factor=self.hyperparameters['discount_factor']) + + def test_agent(self): + q = QNetwork(copy.deepcopy(self.model)) + return VSarsaTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration']) + + +vsarsa = ParallelPresetBuilder('vsarsa', default_hyperparameters, VSarsaClassicControlPreset) diff --git a/all/presets/classic_control_test.py b/all/presets/classic_control_test.py index 13e62e06..3980e7ef 100644 --- a/all/presets/classic_control_test.py +++ b/all/presets/classic_control_test.py @@ -1,6 +1,8 @@ +import os import unittest +import torch from all.environments import GymEnvironment -from all.presets.validate_agent import validate_agent +from all.logging import DummyWriter from all.presets.classic_control import ( a2c, c51, @@ -16,38 +18,57 @@ class TestClassicControlPresets(unittest.TestCase): + def setUp(self): + self.env = GymEnvironment('CartPole-v0') + self.env.reset() + + def tearDown(self): + if os.path.exists('test_preset.pt'): + os.remove('test_preset.pt') + def test_a2c(self): - self.validate(a2c()) + self.validate(a2c) def test_c51(self): - self.validate(c51()) + self.validate(c51) def test_ddqn(self): - self.validate(ddqn()) + self.validate(ddqn) def test_dqn(self): - self.validate(dqn()) + self.validate(dqn) def test_ppo(self): - self.validate(ppo()) + self.validate(ppo) def test_rainbow(self): - self.validate(rainbow()) + self.validate(rainbow) def test_vac(self): - self.validate(vac()) 
+ self.validate(vac) def test_vpg(self): - self.validate(vpg()) + self.validate(vpg) def test_vsarsa(self): - self.validate(vsarsa()) + self.validate(vsarsa) def test_vqn(self): - self.validate(vqn()) + self.validate(vqn) - def validate(self, make_agent): - validate_agent(make_agent, GymEnvironment("CartPole-v0")) + def validate(self, builder): + preset = builder.device('cpu').env(self.env).build() + # normal agent + agent = preset.agent(writer=DummyWriter(), train_steps=100000) + agent.act(self.env.state) + # test agent + test_agent = preset.test_agent() + test_agent.act(self.env.state) + # test save/load + preset.save('test_preset.pt') + preset = torch.load('test_preset.pt') + test_agent = preset.test_agent() + test_agent.act(self.env.state) if __name__ == "__main__": diff --git a/all/presets/continuous/.DS_Store b/all/presets/continuous/.DS_Store new file mode 100644 index 00000000..56b4cd12 Binary files /dev/null and b/all/presets/continuous/.DS_Store differ diff --git a/all/presets/continuous/__init__.py b/all/presets/continuous/__init__.py index 44db0322..32a98557 100644 --- a/all/presets/continuous/__init__.py +++ b/all/presets/continuous/__init__.py @@ -1,6 +1,10 @@ # from .actor_critic import actor_critic -from .ddpg import ddpg -from .ppo import ppo +from .ddpg import ddpg, DDPGContinuousPreset +from .ppo import ppo, PPOContinuousPreset from .sac import sac -__all__ = ['ddpg', 'ppo', 'sac'] +__all__ = [ + 'ddpg', + 'ppo', + 'sac', +] diff --git a/all/presets/continuous/ddpg.py b/all/presets/continuous/ddpg.py index 11c6c25d..ff5bbf29 100644 --- a/all/presets/continuous/ddpg.py +++ b/all/presets/continuous/ddpg.py @@ -1,42 +1,49 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR -from all.agents import DDPG +from all.agents import DDPG, DDPGTestAgent from all.approximation import QContinuous, PolyakTarget from all.bodies import TimeFeature from all.logging import DummyWriter from all.policies import DeterministicPolicy from all.memory import ExperienceReplayBuffer -from .models import fc_q, fc_deterministic_policy - - -def ddpg( - # Common settings - device="cuda", - discount_factor=0.98, - last_frame=2e6, - # Adam optimizer settings - lr_q=1e-3, - lr_pi=1e-3, - # Training settings - minibatch_size=100, - update_frequency=1, - polyak_rate=0.005, - # Replay Buffer settings - replay_start_size=5000, - replay_buffer_size=1e6, - # Exploration settings - noise=0.1, - # Model construction - q_model_constructor=fc_q, - policy_model_constructor=fc_deterministic_policy -): +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.continuous.models import fc_q, fc_deterministic_policy + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.98, + # Adam optimizer settings + "lr_q": 1e-3, + "lr_pi": 1e-3, + # Training settings + "minibatch_size": 100, + "update_frequency": 1, + "polyak_rate": 0.005, + # Replay Buffer settings + "replay_start_size": 5000, + "replay_buffer_size": 1e6, + # Exploration settings + "noise": 0.1, + # Model construction + "q_model_constructor": fc_q, + "policy_model_constructor": fc_deterministic_policy +} + + +class DDPGContinuousPreset(Preset): """ DDPG continuous control preset. Args: - device (str): The device to load parameters and buffers onto for this agent.. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. 
+ device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr_q (float): Learning rate for the Q network. lr_pi (float): Learning rate for the policy network. minibatch_size (int): Number of experiences to sample in each training update. @@ -48,53 +55,66 @@ def ddpg( q_model_constructor (function): The function used to construct the neural q model. policy_model_constructor (function): The function used to construct the neural policy model. """ - def _ddpg(env, writer=DummyWriter()): - final_anneal_step = (last_frame - replay_start_size) // update_frequency - q_model = q_model_constructor(env).to(device) - q_optimizer = Adam(q_model.parameters(), lr=lr_q) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.q_model = hyperparameters["q_model_constructor"](env).to(device) + self.policy_model = hyperparameters["policy_model_constructor"](env).to(device) + self.action_space = env.action_space + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = (train_steps - self.hyperparameters["replay_start_size"]) / self.hyperparameters["update_frequency"] + + q_optimizer = Adam(self.q_model.parameters(), lr=self.hyperparameters["lr_q"]) + q = QContinuous( - q_model, + self.q_model, q_optimizer, - target=PolyakTarget(polyak_rate), + target=PolyakTarget(self.hyperparameters["polyak_rate"]), scheduler=CosineAnnealingLR( q_optimizer, - final_anneal_step + n_updates ), writer=writer ) - policy_model = policy_model_constructor(env).to(device) - policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"]) policy = DeterministicPolicy( - policy_model, + self.policy_model, policy_optimizer, - env.action_space, - target=PolyakTarget(polyak_rate), + self.action_space, + target=PolyakTarget(self.hyperparameters["polyak_rate"]), scheduler=CosineAnnealingLR( policy_optimizer, - final_anneal_step + n_updates ), writer=writer ) replay_buffer = ExperienceReplayBuffer( - replay_buffer_size, - device=device + self.hyperparameters["replay_buffer_size"], + device=self.device ) return TimeFeature(DDPG( q, policy, replay_buffer, - env.action_space, - noise=noise, - replay_start_size=replay_start_size, - discount_factor=discount_factor, - update_frequency=update_frequency, - minibatch_size=minibatch_size, + self.action_space, + noise=self.hyperparameters["noise"], + replay_start_size=self.hyperparameters["replay_start_size"], + discount_factor=self.hyperparameters["discount_factor"], + update_frequency=self.hyperparameters["update_frequency"], + minibatch_size=self.hyperparameters["minibatch_size"], )) - return _ddpg + + def test_agent(self): + policy = DeterministicPolicy( + copy.deepcopy(self.policy_model), + None, + self.action_space, + ) + return TimeFeature(DDPGTestAgent(policy)) -__all__ = ["ddpg"] +ddpg = PresetBuilder('ddpg', default_hyperparameters, DDPGContinuousPreset) diff --git a/all/presets/continuous/models/__init__.py b/all/presets/continuous/models/__init__.py index 4e4a2d1e..02dce45e 100644 --- a/all/presets/continuous/models/__init__.py +++ b/all/presets/continuous/models/__init__.py @@ -5,9 +5,10 @@ current timestep is used in addition to the features received from the environment. 
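The "+ 1" in the input widths below is that extra timestep feature. A small sketch (LunarLanderContinuous-v2 is only an example, with an 8-dimensional observation and a 2-dimensional action):

from all.environments import GymEnvironment
from all.presets.continuous.models import fc_q, fc_policy

env = GymEnvironment('LunarLanderContinuous-v2')
q = fc_q(env)        # input width = 8 (state) + 2 (action) + 1 (time feature) = 11
pi = fc_policy(env)  # defined at the bottom of this file: outputs 2 means
                     # concatenated with 2 expanded log standard deviations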
''' -import numpy as np +import torch from all import nn + def fc_q(env, hidden1=400, hidden2=300): return nn.Sequential( nn.Linear(env.state_space.shape[0] + env.action_space.shape[0] + 1, hidden1), @@ -17,6 +18,7 @@ def fc_q(env, hidden1=400, hidden2=300): nn.Linear0(hidden2, 1), ) + def fc_v(env, hidden1=400, hidden2=300): return nn.Sequential( nn.Linear(env.state_space.shape[0] + 1, hidden1), @@ -26,6 +28,7 @@ def fc_v(env, hidden1=400, hidden2=300): nn.Linear0(hidden2, 1), ) + def fc_deterministic_policy(env, hidden1=400, hidden2=300): return nn.Sequential( nn.Linear(env.state_space.shape[0] + 1, hidden1), @@ -35,6 +38,7 @@ def fc_deterministic_policy(env, hidden1=400, hidden2=300): nn.Linear0(hidden2, env.action_space.shape[0]), ) + def fc_soft_policy(env, hidden1=400, hidden2=300): return nn.Sequential( nn.Linear(env.state_space.shape[0] + 1, hidden1), @@ -44,22 +48,20 @@ def fc_soft_policy(env, hidden1=400, hidden2=300): nn.Linear0(hidden2, env.action_space.shape[0] * 2), ) -def fc_actor_critic(env, hidden1=400, hidden2=300): - features = nn.Sequential( - nn.Linear(env.state_space.shape[0] + 1, hidden1), - nn.ReLU(), - ) - - v = nn.Sequential( - nn.Linear(hidden1, hidden2), - nn.ReLU(), - nn.Linear(hidden2, 1) - ) - policy = nn.Sequential( - nn.Linear(hidden1, hidden2), - nn.ReLU(), - nn.Linear(hidden2, env.action_space.shape[0] * 2) - ) +class fc_policy(nn.Module): + def __init__(self, env, hidden1=400, hidden2=300): + super().__init__() + self.model = nn.Sequential( + nn.Linear(env.state_space.shape[0] + 1, hidden1), + nn.Tanh(), + nn.Linear(hidden1, hidden2), + nn.Tanh(), + nn.Linear(hidden2, env.action_space.shape[0]) + ) + self.log_stds = nn.Parameter(torch.zeros(env.action_space.shape[0])) - return features, v, policy + def forward(self, x): + means = self.model(x) + stds = self.log_stds.expand(*means.shape) + return torch.cat((means, stds), 1) diff --git a/all/presets/continuous/ppo.py b/all/presets/continuous/ppo.py index fd68964e..d792b122 100644 --- a/all/presets/continuous/ppo.py +++ b/all/presets/continuous/ppo.py @@ -1,105 +1,106 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR -from all.agents import PPO -from all.approximation import VNetwork, FeatureNetwork +from all.agents import PPO, PPOTestAgent +from all.approximation import VNetwork, FeatureNetwork, Identity from all.bodies import TimeFeature from all.logging import DummyWriter from all.optim import LinearScheduler from all.policies import GaussianPolicy -from .models import fc_actor_critic - - -def ppo( - # Common settings - device="cuda", - discount_factor=0.98, - last_frame=2e6, - # Adam optimizer settings - lr=3e-4, # Adam learning rate - eps=1e-5, # Adam stability - # Loss scaling - entropy_loss_scaling=0.01, - value_loss_scaling=0.5, - # Training settings - clip_grad=0.5, - clip_initial=0.2, - clip_final=0.01, - epochs=20, - minibatches=4, - # Batch settings - n_envs=32, - n_steps=128, - # GAE settings - lam=0.95, - # Model construction - ac_model_constructor=fc_actor_critic -): +from all.presets.builder import ParallelPresetBuilder +from all.presets.preset import ParallelPreset +from all.presets.continuous.models import fc_policy, fc_v + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.98, + # Adam optimizer settings + "lr": 3e-4, # Adam learning rate + "eps": 1e-5, # Adam stability + # Loss scaling + "entropy_loss_scaling": 0.01, + "value_loss_scaling": 0.5, + # Training settings + "clip_grad": 0.5, + "clip_initial": 0.2, + 
"clip_final": 0.01, + "epochs": 20, + "minibatches": 4, + # Batch settings + "n_envs": 32, + "n_steps": 128, + # GAE settings + "lam": 0.95, + # Model construction + "value_model_constructor": fc_v, + "policy_model_constructor": fc_policy, +} + + +class PPOContinuousPreset(ParallelPreset): """ - PPO continuous control preset. + Proximal Policy Optimization (PPO) Continuous Control Preset. Args: - device (str): The device to load parameters and buffers onto for this agent. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. lr (float): Learning rate for the Adam optimizer. eps (float): Stability parameters for the Adam optimizer. entropy_loss_scaling (float): Coefficient for the entropy term in the total loss. value_loss_scaling (float): Coefficient for the value function loss. - clip_grad (float): The maximum magnitude of the gradient for any given parameter. Set to 0 to disable. + clip_grad (float): Clips the gradient during training so that its L2 norm (calculated over all parameters) + # is no greater than this bound. Set to 0 to disable. clip_initial (float): Value for epsilon in the clipped PPO objective function at the beginning of training. clip_final (float): Value for epsilon in the clipped PPO objective function at the end of training. - epochs (int): Number of times to iterature through each batch. + epochs (int): Number of times to literature through each batch. minibatches (int): The number of minibatches to split each batch into. n_envs (int): Number of parallel actors. n_steps (int): Length of each rollout. lam (float): The Generalized Advantage Estimate (GAE) decay parameter. - ac_model_constructor (function): The function used to construct the neural feature, value and policy model. + value_model_constructor (function): The function used to construct the neural value model. + policy_model_constructor (function): The function used to construct the neural policy model. 
""" - def _ppo(envs, writer=DummyWriter()): - final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs) - env = envs[0] - feature_model, value_model, policy_model = ac_model_constructor(env) - feature_model.to(device) - value_model.to(device) - policy_model.to(device) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.value_model = hyperparameters["value_model_constructor"](env).to(device) + self.policy_model = hyperparameters["policy_model_constructor"](env).to(device) + self.action_space = env.action_space - feature_optimizer = Adam( - feature_model.parameters(), lr=lr, eps=eps - ) - value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) - policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs']) + + value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters['lr'], eps=self.hyperparameters['eps']) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters['lr'], eps=self.hyperparameters['eps']) + + features = Identity(self.device) - features = FeatureNetwork( - feature_model, - feature_optimizer, - clip_grad=clip_grad, - scheduler=CosineAnnealingLR( - feature_optimizer, - final_anneal_step - ), - writer=writer - ) v = VNetwork( - value_model, + self.value_model, value_optimizer, - loss_scaling=value_loss_scaling, - clip_grad=clip_grad, + loss_scaling=self.hyperparameters['value_loss_scaling'], + clip_grad=self.hyperparameters['clip_grad'], writer=writer, scheduler=CosineAnnealingLR( value_optimizer, - final_anneal_step + n_updates ), ) + policy = GaussianPolicy( - policy_model, + self.policy_model, policy_optimizer, - env.action_space, - clip_grad=clip_grad, + self.action_space, + clip_grad=self.hyperparameters['clip_grad'], writer=writer, scheduler=CosineAnnealingLR( policy_optimizer, - final_anneal_step + n_updates ), ) @@ -108,24 +109,26 @@ def _ppo(envs, writer=DummyWriter()): v, policy, epsilon=LinearScheduler( - clip_initial, - clip_final, + self.hyperparameters['clip_initial'], + self.hyperparameters['clip_final'], 0, - final_anneal_step, + n_updates, name='clip', writer=writer ), - epochs=epochs, - minibatches=minibatches, - n_envs=n_envs, - n_steps=n_steps, - discount_factor=discount_factor, - lam=lam, - entropy_loss_scaling=entropy_loss_scaling, + epochs=self.hyperparameters['epochs'], + minibatches=self.hyperparameters['minibatches'], + n_envs=self.hyperparameters['n_envs'], + n_steps=self.hyperparameters['n_steps'], + discount_factor=self.hyperparameters['discount_factor'], + lam=self.hyperparameters['lam'], + entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'], writer=writer, )) - return _ppo, n_envs + def test_agent(self): + policy = GaussianPolicy(copy.deepcopy(self.policy_model), space=self.action_space) + return TimeFeature(PPOTestAgent(Identity(self.device), policy)) -__all__ = ["ppo"] +ppo = ParallelPresetBuilder('ppo', default_hyperparameters, PPOContinuousPreset) diff --git a/all/presets/continuous/sac.py b/all/presets/continuous/sac.py index 5cf23331..29b9290f 100644 --- a/all/presets/continuous/sac.py +++ b/all/presets/continuous/sac.py @@ -1,47 +1,53 @@ +import copy from torch.optim import Adam from torch.optim.lr_scheduler import CosineAnnealingLR -from all.agents import SAC 
+from all.agents import SAC, SACTestAgent from all.approximation import QContinuous, PolyakTarget, VNetwork from all.bodies import TimeFeature from all.logging import DummyWriter from all.policies.soft_deterministic import SoftDeterministicPolicy from all.memory import ExperienceReplayBuffer -from .models import fc_q, fc_v, fc_soft_policy - - -def sac( - # Common settings - device="cuda", - discount_factor=0.98, - last_frame=2e6, - # Adam optimizer settings - lr_q=1e-3, - lr_v=1e-3, - lr_pi=1e-4, - # Training settings - minibatch_size=100, - update_frequency=2, - polyak_rate=0.005, - # Replay Buffer settings - replay_start_size=5000, - replay_buffer_size=1e6, - # Exploration settings - temperature_initial=0.1, - lr_temperature=1e-5, - entropy_target_scaling=1., - # Model construction - q1_model_constructor=fc_q, - q2_model_constructor=fc_q, - v_model_constructor=fc_v, - policy_model_constructor=fc_soft_policy -): +from all.presets.builder import PresetBuilder +from all.presets.preset import Preset +from all.presets.continuous.models import fc_q, fc_v, fc_soft_policy + + +default_hyperparameters = { + # Common settings + "discount_factor": 0.98, + # Adam optimizer settings + "lr_q": 1e-3, + "lr_v": 1e-3, + "lr_pi": 1e-4, + # Training settings + "minibatch_size": 100, + "update_frequency": 2, + "polyak_rate": 0.005, + # Replay Buffer settings + "replay_start_size": 5000, + "replay_buffer_size": 1e6, + # Exploration settings + "temperature_initial": 0.1, + "lr_temperature": 1e-5, + "entropy_target_scaling": 1., + # Model construction + "q1_model_constructor": fc_q, + "q2_model_constructor": fc_q, + "v_model_constructor": fc_v, + "policy_model_constructor": fc_soft_policy +} + + +class SACContinuousPreset(Preset): """ - SAC continuous control preset. + Soft Actor-Critic (SAC) continuous control preset. Args: - device (str): The device to load parameters and buffers onto for this agent.. - discount_factor (float): Discount factor for future rewards. - last_frame (int): Number of frames to train. + env (all.environments.AtariEnvironment): The environment for which to construct the agent. + name (str): A human-readable name for the preset. + device (torch.device): The device on which to load the agent. + + Keyword Args: lr_q (float): Learning rate for the Q networks. lr_v (float): Learning rate for the state-value networks. lr_pi (float): Learning rate for the policy network. @@ -53,70 +59,75 @@ def sac( temperature_initial (float): Initial value of the temperature parameter. lr_temperature (float): Learning rate for the temperature. Should be low compared to other learning rates. entropy_target_scaling (float): The target entropy will be -(entropy_target_scaling * env.action_space.shape[0]) - q1_model_constructor(function): The function used to construct the neural q1 model. - q2_model_constructor(function): The function used to construct the neural q2 model. - v_model_constructor(function): The function used to construct the neural v model. - policy_model_constructor(function): The function used to construct the neural policy model. + q1_model_constructor (function): The function used to construct the neural q1 model. + q2_model_constructor (function): The function used to construct the neural q2 model. + v_model_constructor (function): The function used to construct the neural v model. + policy_model_constructor (function): The function used to construct the neural policy model. 
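The entropy target handed to the SAC agent in agent() below is derived from the action space; a worked sketch with the defaults (a 2-dimensional action space, as in LunarLanderContinuous-v2, is only an example):

# entropy_target = -(action_space.shape[0] * entropy_target_scaling)
action_dims = 2
entropy_target_scaling = 1.0
entropy_target = -(action_dims * entropy_target_scaling)   # -2.0
# the temperature starts at temperature_initial=0.1 and is adjusted with its
# own small learning rate, lr_temperature=1e-5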
""" - def _sac(env, writer=DummyWriter()): - final_anneal_step = (last_frame - replay_start_size) // update_frequency - q_1_model = q1_model_constructor(env).to(device) - q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q) + def __init__(self, env, name, device, **hyperparameters): + super().__init__(name, device, hyperparameters) + self.q_1_model = hyperparameters["q1_model_constructor"](env).to(device) + self.q_2_model = hyperparameters["q2_model_constructor"](env).to(device) + self.v_model = hyperparameters["v_model_constructor"](env).to(device) + self.policy_model = hyperparameters["policy_model_constructor"](env).to(device) + self.action_space = env.action_space + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + n_updates = (train_steps - self.hyperparameters["replay_start_size"]) / self.hyperparameters["update_frequency"] + + q_1_optimizer = Adam(self.q_1_model.parameters(), lr=self.hyperparameters["lr_q"]) q_1 = QContinuous( - q_1_model, + self.q_1_model, q_1_optimizer, scheduler=CosineAnnealingLR( q_1_optimizer, - final_anneal_step + n_updates ), writer=writer, name='q_1' ) - q_2_model = q2_model_constructor(env).to(device) - q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q) + q_2_optimizer = Adam(self.q_2_model.parameters(), lr=self.hyperparameters["lr_q"]) q_2 = QContinuous( - q_2_model, + self.q_2_model, q_2_optimizer, scheduler=CosineAnnealingLR( q_2_optimizer, - final_anneal_step + n_updates ), writer=writer, name='q_2' ) - v_model = v_model_constructor(env).to(device) - v_optimizer = Adam(v_model.parameters(), lr=lr_v) + v_optimizer = Adam(self.v_model.parameters(), lr=self.hyperparameters["lr_v"]) v = VNetwork( - v_model, + self.v_model, v_optimizer, scheduler=CosineAnnealingLR( v_optimizer, - final_anneal_step + n_updates ), - target=PolyakTarget(polyak_rate), + target=PolyakTarget(self.hyperparameters["polyak_rate"]), writer=writer, name='v', ) - policy_model = policy_model_constructor(env).to(device) - policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) + policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"]) policy = SoftDeterministicPolicy( - policy_model, + self.policy_model, policy_optimizer, - env.action_space, + self.action_space, scheduler=CosineAnnealingLR( policy_optimizer, - final_anneal_step + n_updates ), writer=writer ) replay_buffer = ExperienceReplayBuffer( - replay_buffer_size, - device=device + self.hyperparameters["replay_buffer_size"], + device=self.device ) return TimeFeature(SAC( @@ -125,13 +136,19 @@ def _sac(env, writer=DummyWriter()): q_2, v, replay_buffer, - temperature_initial=temperature_initial, - entropy_target=(-env.action_space.shape[0] * entropy_target_scaling), - lr_temperature=lr_temperature, - replay_start_size=replay_start_size, - discount_factor=discount_factor, - update_frequency=update_frequency, - minibatch_size=minibatch_size, + temperature_initial=self.hyperparameters["temperature_initial"], + entropy_target=(-self.action_space.shape[0] * self.hyperparameters["entropy_target_scaling"]), + lr_temperature=self.hyperparameters["lr_temperature"], + replay_start_size=self.hyperparameters["replay_start_size"], + discount_factor=self.hyperparameters["discount_factor"], + update_frequency=self.hyperparameters["update_frequency"], + minibatch_size=self.hyperparameters["minibatch_size"], writer=writer )) - return _sac + + def test_agent(self): + policy = SoftDeterministicPolicy(copy.deepcopy(self.policy_model), space=self.action_space) + return 
TimeFeature(SACTestAgent(policy)) + + +sac = PresetBuilder('sac', default_hyperparameters, SACContinuousPreset) diff --git a/all/presets/continuous_test.py b/all/presets/continuous_test.py index e0683b47..789cc060 100644 --- a/all/presets/continuous_test.py +++ b/all/presets/continuous_test.py @@ -1,20 +1,48 @@ +import os import unittest +import torch +from all.core import State from all.environments import GymEnvironment -from all.presets.validate_agent import validate_agent -from all.presets.continuous import ddpg, ppo, sac +from all.logging import DummyWriter +from all.presets.continuous import ( + ddpg, + ppo, + sac, +) + class TestContinuousPresets(unittest.TestCase): + def setUp(self): + self.env = GymEnvironment('LunarLanderContinuous-v2') + self.env.reset() + + def tearDown(self): + if os.path.exists('test_preset.pt'): + os.remove('test_preset.pt') + def test_ddpg(self): - self.validate(ddpg(replay_start_size=50, device='cpu')) + self.validate(ddpg) def test_ppo(self): - self.validate(ppo(n_envs=4, n_steps=4, epochs=4, minibatches=4, device='cpu')) + self.validate(ppo) def test_sac(self): - self.validate(sac(replay_start_size=50, device='cpu')) + self.validate(sac) + + def validate(self, builder): + preset = builder.device('cpu').env(self.env).build() + # normal agent + agent = preset.agent(writer=DummyWriter(), train_steps=100000) + agent.act(self.env.state) + # test agent + test_agent = preset.test_agent() + test_agent.act(self.env.state) + # test save/load + preset.save('test_preset.pt') + preset = torch.load('test_preset.pt') + test_agent = preset.test_agent() + test_agent.act(self.env.state) - def validate(self, make_agent): - validate_agent(make_agent, GymEnvironment('LunarLanderContinuous-v2')) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/all/presets/independent_multiagent.py b/all/presets/independent_multiagent.py new file mode 100644 index 00000000..d2aa5191 --- /dev/null +++ b/all/presets/independent_multiagent.py @@ -0,0 +1,21 @@ +from .builder import PresetBuilder +from .preset import Preset +from all.agents import IndependentMultiagent +from all.logging import DummyWriter + + +class IndependentMultiagentPreset(Preset): + def __init__(self, name, device, presets): + super().__init__(name, device, presets) + + def agent(self, writer=DummyWriter(), train_steps=float('inf')): + return IndependentMultiagent({ + agent_id: preset.agent(writer=writer, train_steps=train_steps) + for agent_id, preset in self.hyperparameters.items() + }) + + def test_agent(self): + return IndependentMultiagent({ + agent_id: preset.test_agent() + for agent_id, preset in self.hyperparameters.items() + }) diff --git a/all/presets/multiagent_atari_test.py b/all/presets/multiagent_atari_test.py new file mode 100644 index 00000000..9b6919b3 --- /dev/null +++ b/all/presets/multiagent_atari_test.py @@ -0,0 +1,42 @@ +import os +import unittest +import torch +from all.environments import MultiagentAtariEnv +from all.logging import DummyWriter +from all.presets.atari import dqn +from all.presets import IndependentMultiagentPreset + + +class TestMultiagentAtariPresets(unittest.TestCase): + def setUp(self): + self.env = MultiagentAtariEnv('pong_v1', device='cpu') + self.env.reset() + + def tearDown(self): + if os.path.exists('test_preset.pt'): + os.remove('test_preset.pt') + + def test_independent(self): + env = MultiagentAtariEnv('pong_v1', device='cpu') + presets = { + agent_id: dqn.device('cpu').env(env.subenvs[agent_id]).build() + for agent_id in env.agents + } + 
self.validate_preset(IndependentMultiagentPreset('independent', 'cpu', presets), env) + + def validate_preset(self, preset, env): + # normal agent + agent = preset.agent(writer=DummyWriter(), train_steps=100000) + agent.act(self.env.last()) + # test agent + test_agent = preset.test_agent() + test_agent.act(self.env.last()) + # test save/load + preset.save('test_preset.pt') + preset = torch.load('test_preset.pt') + test_agent = preset.test_agent() + test_agent.act(self.env.last()) + + +if __name__ == "__main__": + unittest.main() diff --git a/all/presets/preset.py b/all/presets/preset.py new file mode 100644 index 00000000..ca3ffb06 --- /dev/null +++ b/all/presets/preset.py @@ -0,0 +1,111 @@ +from abc import ABC, abstractmethod +import torch + + +class Preset(ABC): + """ + A Preset Agent factory. + + This class allows the user to instantiate preconfigured Agents and test Agents. + All Agents constructed by the Preset share a network model and parameters. + However, other objects, such as ReplayBuffers, are independently created for each Agent. + The Preset can be saved and loaded from disk. + """ + + def __init__(self, name, device, hyperparameters): + self.name = name + self.device = device + self.hyperparameters = hyperparameters + + @abstractmethod + def agent(self, writer=None, train_steps=float('inf')): + """ + Instantiate a training-mode Agent with the existing model. + + Args: + writer (all.logging.Writer, optional): Coefficient for the entropy term in the total loss. + train_steps (int, optional): The number of steps for which the agent will be trained. + + Returns: + all.agents.Agent: The instantiated Agent. + """ + pass + + @abstractmethod + def test_agent(self): + """ + Instansiate a test-mode Agent with the existing model. + + Returns: + all.agents.Agent: The instantiated test Agent. + """ + pass + + def save(self, filename): + """ + Save the preset and the contained model to disk. + + The preset can later be loaded using torch.load(filename), allowing + a test mode agent to be instantiated for evaluation or other purposes. + + Args: + filename (str): The path where the preset should be saved. + """ + return torch.save(self, filename) + + +class ParallelPreset(): + """ + A Preset ParallelAgent factory. + + This is the ParallelAgent version of all.presets.Preset. + This class allows the user to instantiate preconfigured ParallelAgents and test Agents. + All Agents constructed by the ParallelPreset share a network model and parameters. + However, other objects, such as ReplayBuffers, are independently created for each Agent. + The ParallelPreset can be saved and loaded from disk. + """ + + def __init__(self, name, device, hyperparameters): + self.name = name + self.device = device + self.hyperparameters = hyperparameters + + @abstractmethod + def agent(self, writer=None, train_steps=float('inf')): + """ + Instantiate a training-mode ParallelAgent with the existing model. + + Args: + writer (all.logging.Writer, optional): Coefficient for the entropy term in the total loss. + train_steps (int, optional): The number of steps for which the agent will be trained. + + Returns: + all.agents.ParallelAgent: The instantiated Agent. + """ + pass + + @abstractmethod + def test_agent(self): + """ + Instantiate a test-mode Agent with the existing model. + + Returns: + all.agents.Agent: The instantiated test Agent. + """ + pass + + @property + def n_envs(self): + return self.hyperparameters['n_envs'] + + def save(self, filename): + """ + Save the preset and the contained model to disk. 
+ + The preset can later be loaded using torch.load(filename), allowing + a test mode agent to be instantiated for evaluation or other purposes. + + Args: + filename (str): The path where the preset should be saved. + """ + return torch.save(self, filename) diff --git a/all/presets/validate_agent.py b/all/presets/validate_agent.py deleted file mode 100644 index 4bb072de..00000000 --- a/all/presets/validate_agent.py +++ /dev/null @@ -1,21 +0,0 @@ -import os -from all.logging import DummyWriter -from all.experiments import SingleEnvExperiment, ParallelEnvExperiment - -class TestSingleEnvExperiment(SingleEnvExperiment): - def _make_writer(self, logdir, agent_name, env_name, write_loss): - os.makedirs(logdir, exist_ok=True) - return DummyWriter() - -class TestParallelEnvExperiment(ParallelEnvExperiment): - def _make_writer(self, logdir, agent_name, env_name, write_loss): - os.makedirs(logdir, exist_ok=True) - return DummyWriter() - -def validate_agent(agent, env): - if isinstance(agent, tuple): - experiment = TestParallelEnvExperiment(agent, env, quiet=True) - else: - experiment = TestSingleEnvExperiment(agent, env, quiet=True) - experiment.train(episodes=2) - experiment.test(episodes=2) diff --git a/benchmarks/atari40.png b/benchmarks/atari40.png index c843f666..4e2d8e45 100644 Binary files a/benchmarks/atari40.png and b/benchmarks/atari40.png differ diff --git a/benchmarks/atari40.py b/benchmarks/atari40.py index 867da8e4..26541cec 100644 --- a/benchmarks/atari40.py +++ b/benchmarks/atari40.py @@ -2,19 +2,21 @@ from all.presets import atari from all.environments import AtariEnvironment + def main(): agents = [ - atari.a2c(), - atari.c51(), - atari.dqn(), - atari.ddqn(), - atari.ppo(), - atari.rainbow(), + atari.a2c, + atari.c51, + atari.dqn, + atari.ddqn, + atari.ppo, + atari.rainbow, ] envs = [AtariEnvironment(env, device='cuda') for env in ['BeamRider', 'Breakout', 'Pong', 'Qbert', 'SpaceInvaders']] - SlurmExperiment(agents, envs, 10e6, sbatch_args={ + SlurmExperiment(agents, envs, 10e6, logdir='benchmarks/atari40', sbatch_args={ 'partition': '1080ti-long' }) + if __name__ == "__main__": main() diff --git a/benchmarks/pybullet.png b/benchmarks/pybullet.png index 64e4748d..1602eb76 100644 Binary files a/benchmarks/pybullet.png and b/benchmarks/pybullet.png differ diff --git a/benchmarks/pybullet.py b/benchmarks/pybullet.py index e91a9471..4d3ab924 100644 --- a/benchmarks/pybullet.py +++ b/benchmarks/pybullet.py @@ -1,31 +1,23 @@ -import pybullet -import pybullet_envs from all.experiments import SlurmExperiment from all.presets.continuous import ddpg, ppo, sac -from all.environments import GymEnvironment +from all.environments import PybulletEnvironment -def main(): - device = 'cuda' +def main(): frames = int(1e7) agents = [ - ddpg(last_frame=frames), - ppo(last_frame=frames), - sac(last_frame=frames) + ddpg, + ppo, + sac ] - envs = [GymEnvironment(env, device) for env in [ - 'AntBulletEnv-v0', - "HalfCheetahBulletEnv-v0", - 'HumanoidBulletEnv-v0', - 'HopperBulletEnv-v0', - 'Walker2DBulletEnv-v0' - ]] + envs = [PybulletEnvironment(env, device='cuda') for env in PybulletEnvironment.short_names] - SlurmExperiment(agents, envs, frames, sbatch_args={ + SlurmExperiment(agents, envs, frames, logdir='benchmarks/pybullet', sbatch_args={ 'partition': '1080ti-long' }) + if __name__ == "__main__": main() diff --git a/docs/source/conf.py b/docs/source/conf.py index d23970c1..838bd1a8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,7 +22,7 @@ author = 'Chris Nota' # The full version, 
including alpha/beta/rc tags -release = '0.3.3' +release = '0.7.0' # -- General configuration --------------------------------------------------- diff --git a/docs/source/guide/basic_concepts.rst b/docs/source/guide/basic_concepts.rst index 4c8df29c..a62e2984 100644 --- a/docs/source/guide/basic_concepts.rst +++ b/docs/source/guide/basic_concepts.rst @@ -12,12 +12,12 @@ To see what we mean by this, check out the OpenAI Baselines implementation of DQ There's a giant function called ``learn`` which accepts an environment and a bunch of hyperparameters, at the heart of which there is a control loop which calls many different functions. Which part of this function is the agent? Which part is the environment? Which part is something else? We call this implementation algorithm-based because the central abstraction is a function called ``learn`` which provides the complete specification of an algorithm. -What should the proper abstraction for agent be, then? We have to look no further than the following famous diagram: +What should the proper abstraction for agent be, then? We have to look no further than the following famous diagram from the Sutton and Barto textbook: .. image:: ./rl.jpg The definition of an ``Agent`` is simple. -It accepts a state and returns an action. +It accepts a state and reward, and returns an action. That's it. Everything else is an implementation detail. Here's the ``Agent`` interface in the autonomous-learning-library: @@ -29,12 +29,7 @@ Here's the ``Agent`` interface in the autonomous-learning-library: def act(self, state): pass - @abstractmethod - def eval(self, state): - pass - The ``act`` function is called when training the agent. -The ``eval`` function is called when evaluating the agent, e.g., after a training run has completed. When and how the ``Agent`` trains inside of this function is nobody's business except the ``Agent`` itself. When the ``Agent`` is allowed to act is determined by some outer control loop, and is not of concern to the ``Agent``. What might an implementation of ``act`` look like? Here's the act function from our DQN implementation: @@ -45,8 +40,8 @@ What might an implementation of ``act`` look like? Here's the act function from self.replay_buffer.store(self._state, self._action, state) self._train() self._state = state - self._action = self.policy(state) - return self.action + self._action = self.policy.no_grad(state) + return self._action That's it. ``_train()`` is a private helper methods. There is no reason for the control loop to know anything about these details. @@ -55,7 +50,7 @@ This approach simplifies both our ``Agent`` implementation and the control loop Separating the control loop logic from the ``Agent`` logic allows greater flexibility in the way agents are used. In fact, ``Agent`` is entirely decoupled from the ``Environment`` interface. -This means that our agents can be used outside of standard research environments, such as part of a REST api, a multi-agent system, etc. +This means that our agents can be used outside of standard research environments, such as part of a REST API, a multi-agent system, etc. Any code that passes a ``State`` is compatible with our agents. What is a ``State``? @@ -65,6 +60,29 @@ A ``StateArray`` object can be constucted by calling ``State.array(list_of_state Arbitrary entries can be added to a ``State``, and use of the ``StateArray`` abstraction ensures that these entries are combined and sliced properly. 
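For instance, here is a minimal sketch (only the dict-style constructor and ``State.array`` are used; the extra entry name is illustrative):

.. code-block:: python

    import torch
    from all.core import State

    # a State wraps an observation along with any arbitrary extra entries
    state = State({
        'observation': torch.tensor([1., 2., 3.]),
        'my_entry': torch.tensor(1.),
    })

    # several States can be combined into a StateArray,
    # whose entries are batched and sliced together
    state_array = State.array([state, state, state])
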
The code does not need to be tightly coupled to the shape of the data, but rather can act on the abstraction. +Parallel Agents and Multiagents +------------------------------- + +We described above the base ``Agent`` interface. +However, some algorithms do not fit this interface. +For example, a ``ParallelAgent`` accepts a ``StateArray`` rather than a ``State``. +A ``Multiagent`` accepts a ``State`` object containing a special ``Agent`` key indicating to which of the multiagents the current state belongs, +we we call a ``MultiagentState``. +Nevertheless, we stick to the spirit of having a single ``act()`` function as closely as possible. +The resulting interfaces are as follows: + +.. code-block:: python + + class ParallelAgent(ABC): + @abstractmethod + def act(self, state_array): + pass + + class Multiagent(ABC): + @abstractmethod + def act(self, multiagent_state): + pass + Function Approximation ---------------------- @@ -139,7 +157,7 @@ By encapsulating these details in ``Approximation``, we are able to follow the ` A few other quick things to note: ``f.no_grad(x)`` runs a forward pass with ``torch.no_grad()``, speeding computation. ``f.eval(x)`` does the same, but also puts the model in `eval` mode first, (e.g., ``BatchNorm`` or ``Dropout`` layers), and then puts the model back into its previous mode before returning. -``f.target(x)`` calls the *target network* (an advanced concept used in algorithms such as DQN. S, for example, David Silver's `course notes `_) associated with the ``Approximation``, also with ``torch.no_grad()``. +``f.target(x)`` calls the *target network* (an advanced concept used in algorithms such as DQN. For example, David Silver's `course notes `_) associated with the ``Approximation``, also with ``torch.no_grad()``. The ``autonomous-learning-library`` provides a few thin wrappers over ``Approximation`` for particular purposes, such as ``QNetwork``, ``VNetwork``, ``FeatureNetwork``, and several ``Policy`` implementations. Environments @@ -163,17 +181,16 @@ Below, we show how several different types of environments can be created: .. code-block:: python - from all.environments import AtariEnvironment, GymEnvironment + from all.environments import AtariEnvironment, GymEnvironment, PybulletEnvironment # create an Atari environment on the gpu env = AtariEnvironment('Breakout', device='cuda') - # create a classic control environment on the compute + # create a classic control environment on the cpu env = GymEnvironment('CartPole-v0') # create a PyBullet environment on the cpu - import pybullet_envs - env = GymEnvironment('HalfCheetahBulletEnv-v0') + env = PybulletEnvironment('cheetah') Now we can write our first control loop: @@ -208,69 +225,61 @@ The library provides a number of functions which compose these objects in specif We call such a function a ``preset``, and several such presets are contained in the ``all.presets`` package. (This is an example of the more general `factory method pattern `_). -For example, ``all.agents.vqn`` contains a high-level description of a vanilla Q-learning algorithm. -In order to actually apply this agent to a problem, for example, a classic control problem, we might define the following preset: + +For example, ``all.agents.dqn`` contains a high-level description of the DQN algorithm. +However, how do we actually instansiate a particular network architecture, choose a learning rate, etc.? +This is what presets are for. +Before we dive into the details, let us show the simplest usage in practice: .. 
code-block:: python - # The outer function signature contains the set of hyperparameters - def vqn( - # Common settings - device="cpu", - # Hyperparameters - discount_factor=0.99, - lr=1e-2, - exploration=0.1, - ): - # The inner function creates a closure over the hyperparameters passed into the outer function. - # It accepts an "env" object which is passed right before the Experiment begins, as well as - # the writer created by the Experiment which defines the logging parameters. - def _vqn(env, writer=DummyWriter()): - # create a pytorch model - model = nn.Sequential( - nn.Linear(env.state_space.shape[0], 64), - nn.ReLU(), - nn.Linear(64, env.action_space.n), - ).to(device) - - # create a pytorch optimizer for the model - optimizer = Adam(model.parameters(), lr=lr) - - # create an Approximation of the Q-function - q = QNetwork(model, optimizer, writer=writer) - - # create a Policy object derived from the Q-function - policy = GreedyPolicy(q, env.action_space.n, epsilon=exploration) - - # instansiate the agent - return VQN(q, policy, discount_factor=discount_factor) - - # return the inner function - return _vqn - -Notice how there is an "outer function" and an "inner" function. -This approach allows the separation of configuration and instantiation. -While this may seem redundant, it can sometimes be useful. -For example, suppose we want to run the same agent on multiple environments. -This can be done as follows: + from all.presets.atari import dqn + from all.environments import AtariEnvironment + + # create an environment + env = AtariEnvironment('Breakout') + + # configure and build the preset + preset = dqn.env(env).build() + + # use the preset to create an agent + agent = preset.agent() + +Instansiating the Agent is separated into two steps: +First we configure and build the ``Preset``, then we use the configured ``Preset`` to instansiate an ``Agent``. +Let's dig into the ``Preset`` interface first: .. code-block:: python - agent = vqn() - some_custom_runner(agent(), GymEnvironment('CartPole-v0')) - some_custom_runner(agent(), GymEnvironment('MountainCar-v0')) + class Preset(ABC): + @abstractmethod + def agent(self, writer=None, train_steps=float('inf')): + pass + + @abstractmethod + def test_agent(self): + pass + + def save(self, filename): + return torch.save(self, filename) + + +The ``agent()`` method instansiates a training ``Agent``. +The ``test_agent()`` method instansiates a test-mode ``Agent`` using the same network parameters as the training ``Agent``. +The ``save()`` then allows the ``Preset`` to be saved to a disk. +Critically, all agents created by a given instance of a ``Preset`` share the underlying network parameters. +The test agents, however, will instead copy the parameters, allowing test agents to be compared from multiple points in training. +If a ``Preset`` is loaded from disk, then we can instansiate a test ``Agent`` using the pre-trained parameters. + + -Now, each call to ``some_custom_runner`` receives a unique instance of the agent. -This is sometimes achieved in other libraries by providing a "reset" function on the agent. -We find our approach allows us to keep the ``Agent`` interface clean, -and is overall more elegant and less error prone. Experiment ---------- Finally, we have all of the components necessary to introduce the ``run_experiment`` helper function. ``run_experiment`` is the built-in control loop for running reinforcement learning experiment. 
-It instansiates its own ``Writer`` object, which is then passed to each of the agents, and runs each agent on each environment passed to it for some number of timesteps (frames) or episodes). +It instansiates its own ``Writer`` object for logging, which is then passed to each of the presets, and runs each agent on each environment passed to it for some number of timesteps (frames) or episodes). Here is a quick example: .. code-block:: python @@ -324,7 +333,7 @@ This is useful measuring the final performance of an agent. You can also pass optional parameters to ``run_experiment`` to change its behavior. You can set ``render=True`` to watch the agent during training (generally not recommended: it slows the agent considerably!). You can set ``quiet=True`` to silence command line output. -Lastly, you can set ``write_loss=False`` to disable writing debugging information to ``tensorboard``. +Lastly, you can set ``write_loss=False`` to disable writing loss and debugging information to ``tensorboard``. These files can become large, so this is recommended if you have limited storage! Finally, ``run_experiment`` relies on an underlying ``Experiment`` API. diff --git a/docs/source/guide/benchmark_performance.rst b/docs/source/guide/benchmark_performance.rst index 12e2a3e3..1a9348c6 100644 --- a/docs/source/guide/benchmark_performance.rst +++ b/docs/source/guide/benchmark_performance.rst @@ -37,13 +37,13 @@ For comparison, we look at the results published in the paper, `Rainbow: Combini In these results, the authors ran each agent for 50 million timesteps (200 million frames). We can see that at the 10 million timestep mark, our results are similar or slightly better. Our ``dqn`` and ``ddqn`` in particular were better almost across the board. -While there are almost certainly some minor implementation differences, +While there are some minor implementation differences (for example, we use ``Adam`` for most algorithms instead of ``RMSprop``), our agents achieved very similar behavior to the agents tested by DeepMind. PyBullet Benchmark ------------------ -[PyBullet](https://pybullet.org/wordpress/) provides a free alternative to the popular MuJoCo robotics environments. +`PyBullet `_ provides a free alternative to the popular MuJoCo robotics environments. While MuJoCo requires a license key and can be difficult for independent researchers to afford, PyBullet is free and open. Additionally, the PyBullet environments are widely considered more challenging, making them a more discriminant test bed. For these reasons, we chose to benchmark the ``all.presets.continuous`` presets using PyBullet. diff --git a/docs/source/guide/getting_started.rst b/docs/source/guide/getting_started.rst index 7515b881..34caa4df 100644 --- a/docs/source/guide/getting_started.rst +++ b/docs/source/guide/getting_started.rst @@ -4,10 +4,9 @@ Getting Started Prerequisites ------------- -The Autonomous Learning Library requires a recent version of Pytorch (>= 10.3). +The Autonomous Learning Library requires a recent version of PyTorch (~=1.8.0 recommended). Additionally, Tensorboard is required in order to enable logging. -We recommond installing these through `Conda `_. -We also strongly recommend using a machine with a fast GPU (a GTX 970 or better). +We also strongly recommend using a machine with a fast GPU (at minimum a GTX 970 or better, a GTX 1080ti or better is preferred). 
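If you are unsure whether your machine satisfies these prerequisites, a quick check along these lines can help (this assumes PyTorch is already installed):

.. code-block:: bash

    # print the PyTorch version and whether a CUDA-capable GPU is visible
    python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
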
Installation ------------ @@ -18,11 +17,15 @@ The ``autonomous-learning-library`` can be installed from PyPi using ``pip``: pip install autonomous-learning-library -If you don't have PyTorch or Tensorboard previously installed, you can install them using: +This will only install the core library. +If you want to install all included environments, run: .. code-block:: bash - pip install autonomous-learning-library[pytorch] + pip install autonomous-learning-library[all] + +You can also install only a subset of the enviornments. +For the list of optional dependencies, take a look at the `setup.py `_. An alternate approach, that may be useful when following this tutorial, is to instead install by cloning the Github repository: @@ -30,8 +33,9 @@ An alternate approach, that may be useful when following this tutorial, is to in git clone https://github.com/cpnota/autonomous-learning-library.git cd autonomous-learning-library - pip install -e .["dev"] + pip install -e .[dev] +``dev`` will install all of the optional dependencies for developers of the repo, such as unit test and documentation dependencies, as well as all environments. If you chose to clone the repository, you can test your installation by running the unit test suite: .. code-block:: bash @@ -46,13 +50,13 @@ Running a Preset Agent The goal of the Autonomous Learning Library is to provide components for building new agents. However, the library also includes a number of "preset" agent configurations for easy benchmarking and comparison, as well as some useful scripts. -For example, an A2C agent can be run on Cart-Pole as follows: +For example, a PPO agent can be run on Cart-Pole as follows: .. code-block:: bash - all-classic CartPole-v0 ppo + all-classic CartPole-v0 a2c -The results will be written to ``runs/_a2c ``, where ```` is some some string generated by the library. +The results will be written to ``runs/a2c__``, where ```` and ```` are strings generated by the library. You can view these results and other information through `tensorboard`: .. code-block:: bash @@ -73,14 +77,14 @@ This should give you a plot similar to the following: .. image:: plot.png -In this plot, each point represents the average of the episodic returns over the last 100 episodes. +In this plot, each point represents the average of the episodic returns over the last 100 episodes for every 100 episodes. The shaded region represents the standard deviation over that interval. Finally, to watch the trained model in action, we provide a `watch` scripts for each preset module: .. code-block:: bash - all-watch-classic CartPole-v0 "runs/_a2c " + all-watch-classic CartPole-v0 runs/a2c__/preset.pt You need to find the by checking the ``runs`` directory. diff --git a/docs/source/index.rst b/docs/source/index.rst index e1ee621d..f3b311d2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -26,7 +26,7 @@ Enjoy! guide/benchmark_performance .. toctree:: - :maxdepth: 2 + :maxdepth: 4 :caption: Modules: modules/agents diff --git a/docs/source/modules/environments.rst b/docs/source/modules/environments.rst index 33485729..355b2250 100644 --- a/docs/source/modules/environments.rst +++ b/docs/source/modules/environments.rst @@ -4,5 +4,8 @@ all.environments ================= +.. automodsumm:: all.environments + .. 
automodule:: all.environments :members: + :inherited-members: diff --git a/docs/source/modules/presets.rst b/docs/source/modules/presets.rst index ae15fc62..7bd1e430 100644 --- a/docs/source/modules/presets.rst +++ b/docs/source/modules/presets.rst @@ -2,9 +2,12 @@ all.presets =========== .. toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: all.presets presets/atari presets/classic presets/continuous + +.. automodule:: all.presets + :members: diff --git a/docs/source/modules/presets/atari.rst b/docs/source/modules/presets/atari.rst index 5f455f10..bb8715b4 100644 --- a/docs/source/modules/presets/atari.rst +++ b/docs/source/modules/presets/atari.rst @@ -8,4 +8,5 @@ all.presets.atari .. automodule:: all.presets.atari :members: - \ No newline at end of file + :inherited-members: + :show-inheritance: diff --git a/docs/source/modules/presets/classic.rst b/docs/source/modules/presets/classic.rst index d998d702..f3720085 100644 --- a/docs/source/modules/presets/classic.rst +++ b/docs/source/modules/presets/classic.rst @@ -8,3 +8,4 @@ all.presets.classic_control .. automodule:: all.presets.classic_control :members: + :inherited-members: diff --git a/docs/source/modules/presets/continuous.rst b/docs/source/modules/presets/continuous.rst index 7074488f..a85073f3 100644 --- a/docs/source/modules/presets/continuous.rst +++ b/docs/source/modules/presets/continuous.rst @@ -8,3 +8,4 @@ all.presets.continuous .. automodule:: all.presets.continuous :members: + :inherited-members: diff --git a/examples/experiment.py b/examples/experiment.py index 4232e05d..9492aea4 100644 --- a/examples/experiment.py +++ b/examples/experiment.py @@ -5,15 +5,25 @@ from all.presets.classic_control import dqn, a2c from all.environments import GymEnvironment + def main(): - device = 'cpu' + DEVICE = 'cpu' + # DEVICE = 'cuda' # uncomment for gpu support timesteps = 40000 run_experiment( - [dqn(), a2c()], - [GymEnvironment('CartPole-v0', device), GymEnvironment('Acrobot-v1', device)], + [ + # DQN with default hyperparameters + dqn.device(DEVICE), + # DQN with a custom hyperparameters and a custom name. + dqn.device(DEVICE).hyperparameters(replay_buffer_size=100).name('dqn-small-buffer'), + # A2C with a custom name + a2c.device(DEVICE).name('not-dqn') + ], + [GymEnvironment('CartPole-v0', DEVICE), GymEnvironment('Acrobot-v1', DEVICE)], timesteps, ) plot_returns_100('runs', timesteps=timesteps) + if __name__ == "__main__": main() diff --git a/examples/slurm_experiment.py b/examples/slurm_experiment.py index a5e6a9c5..155fdb5b 100644 --- a/examples/slurm_experiment.py +++ b/examples/slurm_experiment.py @@ -4,15 +4,17 @@ For real experiments, you will surely need a modified version of this script. 
''' from all.experiments import SlurmExperiment -from all.presets.atari import a2c +from all.presets.atari import a2c, dqn from all.environments import AtariEnvironment + def main(): device = 'cuda' envs = [AtariEnvironment(env, device) for env in ['Pong', 'Breakout', 'SpaceInvaders']] - SlurmExperiment(a2c(device=device), envs, 1e6, sbatch_args={ + SlurmExperiment([a2c.device(device), dqn.device(device)], envs, 1e6, sbatch_args={ 'partition': '1080ti-short' }) + if __name__ == "__main__": main() diff --git a/integration/atari_test.py b/integration/atari_test.py new file mode 100644 index 00000000..7ebad9f4 --- /dev/null +++ b/integration/atari_test.py @@ -0,0 +1,120 @@ +import unittest +import torch +from all.environments import AtariEnvironment +from all.presets.atari import ( + a2c, + c51, + ddqn, + dqn, + ppo, + rainbow, + vac, + vpg, + vsarsa, + vqn +) +from validate_agent import validate_agent + + +CPU = torch.device("cpu") +if torch.cuda.is_available(): + CUDA = torch.device("cuda") +else: + print( + "WARNING: CUDA is not available!", + "Running presets in cpu mode.", + "Enable CUDA for full test coverage!", + ) + CUDA = torch.device("cpu") + + +class TestAtariPresets(unittest.TestCase): + def test_a2c(self): + validate_agent(a2c.device(CPU), AtariEnvironment("Breakout", device=CPU)) + + def test_a2c_cuda(self): + validate_agent(a2c.device(CUDA), AtariEnvironment("Breakout", device=CUDA)) + + def test_c51(self): + validate_agent(c51.device(CPU), AtariEnvironment("Breakout", device=CPU)) + + def test_c51_cuda(self): + validate_agent(c51.device(CUDA), AtariEnvironment("Breakout", device=CUDA)) + + def test_ddqn(self): + validate_agent( + ddqn.device(CPU), + AtariEnvironment("Breakout", device=CPU), + ) + + def test_ddqn_cuda(self): + validate_agent( + ddqn.device(CUDA).hyperparameters(replay_start_size=64), + AtariEnvironment("Breakout", device=CUDA), + ) + + def test_dqn(self): + validate_agent( + dqn.device(CPU).hyperparameters(replay_start_size=64), + AtariEnvironment("Breakout", device=CPU), + ) + + def test_dqn_cuda(self): + validate_agent( + dqn.device(CUDA).hyperparameters(replay_start_size=64), + AtariEnvironment("Breakout", device=CUDA), + ) + + def test_ppo(self): + validate_agent(ppo.device(CPU).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CPU)) + + def test_ppo_cuda(self): + validate_agent(ppo.device(CUDA).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CUDA)) + + def test_rainbow(self): + validate_agent( + rainbow.device(CPU).hyperparameters(replay_start_size=64), + AtariEnvironment("Breakout", device=CPU), + ) + + def test_rainbow_cuda(self): + validate_agent( + rainbow.device(CUDA).hyperparameters(replay_start_size=64), + AtariEnvironment("Breakout", device=CUDA), + ) + + def test_vac(self): + validate_agent(vac.device(CPU).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CPU)) + + def test_vac_cuda(self): + validate_agent( + vac.device(CUDA).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CUDA) + ) + + def test_vpg(self): + validate_agent(vpg.device(CPU), AtariEnvironment("Breakout", device=CPU)) + + def test_vpg_cuda(self): + validate_agent( + vpg.device(CUDA), AtariEnvironment("Breakout", device=CUDA) + ) + + def test_vsarsa(self): + validate_agent(vsarsa.device(CPU).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CPU)) + + def test_vsarsa_cuda(self): + validate_agent( + vsarsa.device(CUDA).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CUDA) + ) + + def test_vqn(self): + 
validate_agent(vqn.device(CPU).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CPU)) + + def test_vqn_cuda(self): + validate_agent( + vqn.device(CUDA).hyperparameters(n_envs=4), AtariEnvironment("Breakout", device=CUDA) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/integration/classic_control_test.py b/integration/classic_control_test.py new file mode 100644 index 00000000..a1b49748 --- /dev/null +++ b/integration/classic_control_test.py @@ -0,0 +1,54 @@ +import unittest +from all.environments import GymEnvironment +from all.presets.classic_control import ( + a2c, + c51, + ddqn, + dqn, + ppo, + rainbow, + vac, + vpg, + vqn, + vsarsa, +) +from validate_agent import validate_agent + + +class TestClassicControlPresets(unittest.TestCase): + def test_a2c(self): + self.validate(a2c) + + def test_c51(self): + self.validate(c51) + + def test_ddqn(self): + self.validate(ddqn) + + def test_dqn(self): + self.validate(dqn) + + def test_ppo(self): + self.validate(ppo) + + def test_rainbow(self): + self.validate(rainbow) + + def test_vac(self): + self.validate(vac) + + def test_vpg(self): + self.validate(vpg) + + def test_vsarsa(self): + self.validate(vsarsa) + + def test_vqn(self): + self.validate(vqn) + + def validate(self, builder): + validate_agent(builder.device('cpu'), GymEnvironment("CartPole-v0")) + + +if __name__ == "__main__": + unittest.main() diff --git a/integration/continuous_test.py b/integration/continuous_test.py new file mode 100644 index 00000000..f1fe7cff --- /dev/null +++ b/integration/continuous_test.py @@ -0,0 +1,28 @@ +import unittest +from all.environments import GymEnvironment +from all.presets.continuous import ddpg, ppo, sac +from validate_agent import validate_agent + + +class TestContinuousPresets(unittest.TestCase): + def test_ddpg(self): + validate_agent( + ddpg.device('cpu').hyperparameters(replay_start_size=50), + GymEnvironment('LunarLanderContinuous-v2') + ) + + def test_ppo(self): + validate_agent( + ppo.device('cpu'), + GymEnvironment('LunarLanderContinuous-v2') + ) + + def test_sac(self): + validate_agent( + sac.device('cpu').hyperparameters(replay_start_size=50), + GymEnvironment('LunarLanderContinuous-v2') + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/integration/multiagent_atari_test.py b/integration/multiagent_atari_test.py new file mode 100644 index 00000000..cbc8e7c3 --- /dev/null +++ b/integration/multiagent_atari_test.py @@ -0,0 +1,40 @@ +import unittest +import torch +from all.environments import MultiagentAtariEnv +from all.presets import IndependentMultiagentPreset +from all.presets.atari import dqn +from validate_agent import validate_multiagent + + +CPU = torch.device("cpu") +if torch.cuda.is_available(): + CUDA = torch.device("cuda") +else: + print( + "WARNING: CUDA is not available!", + "Running presets in cpu mode.", + "Enable CUDA for full test coverage!", + ) + CUDA = torch.device("cpu") + + +class TestMultiagentAtariPresets(unittest.TestCase): + def test_independent(self): + env = MultiagentAtariEnv('pong_v1', max_cycles=1000, device=CPU) + presets = { + agent_id: dqn.device(CPU).env(env.subenvs[agent_id]).build() + for agent_id in env.agents + } + validate_multiagent(IndependentMultiagentPreset('independent', CPU, presets), env) + + def test_independent_cuda(self): + env = MultiagentAtariEnv('pong_v1', max_cycles=1000, device=CUDA) + presets = { + agent_id: dqn.device(CUDA).env(env.subenvs[agent_id]).build() + for agent_id in env.agents + } + 
validate_multiagent(IndependentMultiagentPreset('independent', CUDA, presets), env) + + +if __name__ == "__main__": + unittest.main() diff --git a/integration/validate_agent.py b/integration/validate_agent.py new file mode 100644 index 00000000..cab21138 --- /dev/null +++ b/integration/validate_agent.py @@ -0,0 +1,38 @@ +import os +from all.logging import DummyWriter +from all.experiments import SingleEnvExperiment, ParallelEnvExperiment, MultiagentEnvExperiment +from all.presets import ParallelPreset, Preset + + +class TestSingleEnvExperiment(SingleEnvExperiment): + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + os.makedirs(logdir, exist_ok=True) + return DummyWriter() + + +class TestParallelEnvExperiment(ParallelEnvExperiment): + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + os.makedirs(logdir, exist_ok=True) + return DummyWriter() + + +class TestMultiagentEnvExperiment(MultiagentEnvExperiment): + def _make_writer(self, logdir, agent_name, env_name, write_loss, writer): + os.makedirs(logdir, exist_ok=True) + return DummyWriter() + + +def validate_agent(agent, env): + preset = agent.env(env).build() + if isinstance(preset, ParallelPreset): + experiment = TestParallelEnvExperiment(preset, env, quiet=True) + else: + experiment = TestSingleEnvExperiment(preset, env, quiet=True) + experiment.train(episodes=2) + experiment.test(episodes=2) + + +def validate_multiagent(preset, env): + experiment = TestMultiagentEnvExperiment(preset, env, quiet=True) + experiment.train(episodes=2) + experiment.test(episodes=2) diff --git a/scripts/atari.py b/scripts/atari.py index e1f2aca5..29912dbc 100644 --- a/scripts/atari.py +++ b/scripts/atari.py @@ -3,6 +3,7 @@ from all.experiments import run_experiment from all.presets import atari + def main(): parser = argparse.ArgumentParser(description="Run an Atari benchmark.") parser.add_argument("env", help="Name of the Atari game (e.g. Pong).") @@ -18,23 +19,37 @@ def main(): "--frames", type=int, default=40e6, help="The number of training frames." ) parser.add_argument( - "--render", type=bool, default=False, help="Render the environment." + "--render", action="store_true", default=False, help="Render the environment." ) parser.add_argument( "--logdir", default='runs', help="The base logging directory." ) + parser.add_argument( + "--writer", default='tensorboard', help="The backend used for tracking experiment metrics." + ) + parser.add_argument('--hyperparameters', default=[], nargs='*') args = parser.parse_args() env = AtariEnvironment(args.env, device=args.device) + agent_name = args.agent agent = getattr(atari, agent_name) + agent = agent.device(args.device) + + # parse hyperparameters + hyperparameters = {} + for hp in args.hyperparameters: + key, value = hp.split('=') + hyperparameters[key] = type(agent.default_hyperparameters[key])(value) + agent = agent.hyperparameters(**hyperparameters) run_experiment( - agent(device=args.device, last_frame=args.frames), + agent, env, args.frames, render=args.render, - logdir=args.logdir + logdir=args.logdir, + writer=args.writer, ) diff --git a/scripts/classic.py b/scripts/classic.py index bb7a0245..8dd0ed11 100644 --- a/scripts/classic.py +++ b/scripts/classic.py @@ -16,21 +16,44 @@ def main(): help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0).", ) parser.add_argument( - "--frames", type=int, default=20000, help="The number of training frames." + "--frames", type=int, default=50000, help="The number of training frames." 
) parser.add_argument( - "--render", type=bool, default=False, help="Render the environment." + "--render", action="store_true", default=False, help="Render the environment." ) parser.add_argument( "--logdir", default='runs', help="The base logging directory." ) + parser.add_argument("--writer", default='tensorboard', help="The backend used for tracking experiment metrics.") + parser.add_argument( + '--hyperparameters', + default=[], + nargs='*', + help="Custom hyperparameters, in the format hyperparameter1=value1 hyperparameter2=value2 etc." + ) args = parser.parse_args() env = GymEnvironment(args.env, device=args.device) + agent_name = args.agent agent = getattr(classic_control, agent_name) + agent = agent.device(args.device) - run_experiment(agent(device=args.device), env, args.frames, render=args.render, logdir=args.logdir) + # parse hyperparameters + hyperparameters = {} + for hp in args.hyperparameters: + key, value = hp.split('=') + hyperparameters[key] = type(agent.default_hyperparameters[key])(value) + agent = agent.hyperparameters(**hyperparameters) + + run_experiment( + agent, + env, + frames=args.frames, + render=args.render, + logdir=args.logdir, + writer=args.writer, + ) if __name__ == "__main__": diff --git a/scripts/continuous.py b/scripts/continuous.py index 28c1cbf5..679cf080 100644 --- a/scripts/continuous.py +++ b/scripts/continuous.py @@ -1,29 +1,20 @@ # pylint: disable=unused-import import argparse -import pybullet -import pybullet_envs -from all.environments import GymEnvironment +from all.environments import GymEnvironment, PybulletEnvironment from all.experiments import run_experiment from all.presets import continuous -# some example envs -# can also enter ID directly + +# see also: PybulletEnvironment.short_names ENVS = { - # classic continuous environments "mountaincar": "MountainCarContinuous-v0", "lander": "LunarLanderContinuous-v2", - # Bullet robotics environments - "ant": "AntBulletEnv-v0", - "cheetah": "HalfCheetahBulletEnv-v0", - "humanoid": "HumanoidBulletEnv-v0", - "hopper": "HopperBulletEnv-v0", - "walker": "Walker2DBulletEnv-v0" } def main(): parser = argparse.ArgumentParser(description="Run a continuous actions benchmark.") - parser.add_argument("env", help="Name of the env (see envs)") + parser.add_argument("env", help="Name of the env (e.g. 'lander', 'cheetah')") parser.add_argument( "agent", help="Name of the agent (e.g. ddpg). See presets for available agents." ) @@ -36,23 +27,46 @@ def main(): "--frames", type=int, default=2e6, help="The number of training frames." ) parser.add_argument( - "--render", type=bool, default=False, help="Render the environment." + "--render", action="store_true", default=False, help="Render the environment." ) parser.add_argument( "--logdir", default='runs', help="The base logging directory." ) + parser.add_argument("--writer", default='tensorboard', help="The backend used for tracking experiment metrics.") + parser.add_argument( + '--hyperparameters', + default=[], + nargs='*', + help="Custom hyperparameters, in the format hyperparameter1=value1 hyperparameter2=value2 etc." 
+ ) args = parser.parse_args() if args.env in ENVS: - env_id = ENVS[args.env] + env = GymEnvironment(args.env, device=args.device) + elif 'BulletEnv' in args.env or args.env in PybulletEnvironment.short_names: + env = PybulletEnvironment(args.env, device=args.device) else: - env_id = args.env + env = GymEnvironment(args.env, device=args.device) - env = GymEnvironment(env_id, device=args.device) agent_name = args.agent agent = getattr(continuous, agent_name) + agent = agent.device(args.device) - run_experiment(agent(device=args.device), env, frames=args.frames, render=args.render, logdir=args.logdir) + # parse hyperparameters + hyperparameters = {} + for hp in args.hyperparameters: + key, value = hp.split('=') + hyperparameters[key] = type(agent.default_hyperparameters[key])(value) + agent = agent.hyperparameters(**hyperparameters) + + run_experiment( + agent, + env, + frames=args.frames, + render=args.render, + logdir=args.logdir, + writer=args.writer, + ) if __name__ == "__main__": diff --git a/scripts/multiagent_atari.py b/scripts/multiagent_atari.py new file mode 100644 index 00000000..92bd296f --- /dev/null +++ b/scripts/multiagent_atari.py @@ -0,0 +1,37 @@ +import argparse +from all.environments import MultiagentAtariEnv +from all.experiments.multiagent_env_experiment import MultiagentEnvExperiment +from all.presets import multiagent_atari + + +def main(): + parser = argparse.ArgumentParser(description="Run an multiagent Atari benchmark.") + parser.add_argument("env", help="Name of the Atari game (e.g. Pong).") + parser.add_argument( + "agent", help="Name of the agent (e.g. dqn). See presets for available agents." + ) + parser.add_argument( + "--device", + default="cuda", + help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0).", + ) + parser.add_argument( + "--frames", type=int, default=40e6, help="The number of training frames." + ) + parser.add_argument( + "--render", type=bool, default=False, help="Render the environment." + ) + parser.add_argument( + "--writer", default='tensorboard', help="The backend used for tracking experiment metrics." + ) + args = parser.parse_args() + + env = MultiagentAtariEnv(args.env, device=args.device) + agent_name = args.agent + agent = getattr(multiagent_atari, agent_name) + experiment = MultiagentEnvExperiment(agent(device=args.device), env, write_loss=False, writer=args.writer) + experiment.train(frames=args.frames) + + +if __name__ == "__main__": + main() diff --git a/scripts/multiagent_atari_independent.py b/scripts/multiagent_atari_independent.py new file mode 100644 index 00000000..756aee2f --- /dev/null +++ b/scripts/multiagent_atari_independent.py @@ -0,0 +1,63 @@ +import argparse +from all.environments import MultiagentAtariEnv +from all.experiments.multiagent_env_experiment import MultiagentEnvExperiment +from all.presets import atari +from all.presets import IndependentMultiagentPreset + + +class DummyEnv(): + def __init__(self, state_space, action_space): + self.state_space = state_space + self.action_space = action_space + + +def main(): + parser = argparse.ArgumentParser(description="Run an multiagent Atari benchmark.") + parser.add_argument("env", help="Name of the Atari game (e.g. Pong).") + parser.add_argument( + "agents", nargs='*', help="List of agents." + ) + parser.add_argument( + "--device", + default="cuda", + help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0).", + ) + parser.add_argument( + "--replay_buffer_size", + default=100000, + help="The size of the replay buffer, if applicable", + ) + parser.add_argument( + "--frames", type=int, default=40e6, help="The number of training frames." + ) + parser.add_argument( + "--render", action="store_true", default=False, help="Render the environment." + ) + parser.add_argument( + "--writer", default='tensorboard', help="The backend used for tracking experiment metrics." + ) + args = parser.parse_args() + + env = MultiagentAtariEnv(args.env, device=args.device) + + presets = { + agent_id: getattr(atari, agent_type).hyperparameters(replay_buffer_size=args.replay_buffer_size).device(args.device).env( + DummyEnv( + env.observation_spaces[agent_id], env.action_spaces[agent_id] + ) + ).build() + for agent_id, agent_type in zip(env.agents, args.agents) + } + + experiment = MultiagentEnvExperiment( + IndependentMultiagentPreset('Independent', args.device, presets), + env, + write_loss=False, + render=args.render, + writer=args.writer, + ) + experiment.train() + + +if __name__ == "__main__": + main() diff --git a/scripts/plot.py b/scripts/plot.py index 91eaa580..b041657a 100644 --- a/scripts/plot.py +++ b/scripts/plot.py @@ -9,5 +9,6 @@ def main(): args = parser.parse_args() plot_returns_100(args.logdir, timesteps=args.timesteps) + if __name__ == "__main__": main() diff --git a/scripts/release.py b/scripts/release.py index 7ba89043..475e5b80 100644 --- a/scripts/release.py +++ b/scripts/release.py @@ -3,18 +3,19 @@ from all.experiments import SlurmExperiment from all.presets import atari, classic_control, continuous + def main(): # run on gpu device = 'cuda' def get_agents(preset): - agents = [getattr(preset, agent_name) for agent_name in classic_control.__all__] + agents = [getattr(preset, agent_name) for agent_name in preset.__all__] return [agent(device=device) for agent in agents] SlurmExperiment( get_agents(atari), AtariEnvironment('Breakout', device=device), - 2e7, + 10e7, sbatch_args={ 'partition': '1080ti-long' } @@ -38,5 +39,6 @@ def get_agents(preset): } ) + if __name__ == "__main__": main() diff --git a/scripts/watch_atari.py b/scripts/watch_atari.py index dd22b6f6..d5309699 100644 --- a/scripts/watch_atari.py +++ b/scripts/watch_atari.py @@ -1,16 +1,16 @@ import argparse from all.bodies import DeepmindAtariBody from all.environments import AtariEnvironment -from all.experiments import GreedyAgent, watch +from all.experiments import load_and_watch def main(): parser = argparse.ArgumentParser(description="Run an Atari benchmark.") parser.add_argument("env", help="Name of the Atari game (e.g. Pong)") - parser.add_argument("dir", help="Directory where the agent's model was saved.") + parser.add_argument("filename", help="File where the model was saved.") parser.add_argument( "--device", - default="cpu", + default="cuda", help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", ) parser.add_argument( @@ -20,8 +20,8 @@ def main(): ) args = parser.parse_args() env = AtariEnvironment(args.env, device=args.device) - agent = DeepmindAtariBody(GreedyAgent.load(args.dir, env)) - watch(agent, env, fps=args.fps) + load_and_watch(args.filename, env, fps=args.fps) + if __name__ == "__main__": main() diff --git a/scripts/watch_classic.py b/scripts/watch_classic.py index bea4f14e..b9806415 100644 --- a/scripts/watch_classic.py +++ b/scripts/watch_classic.py @@ -2,18 +2,25 @@ from all.environments import GymEnvironment from all.experiments import load_and_watch + def main(): parser = argparse.ArgumentParser(description="Run an Atari benchmark.") parser.add_argument("env", help="Name of the environment (e.g. RoboschoolHalfCheetah-v1") - parser.add_argument("dir", help="Directory where the agent's model was saved.") + parser.add_argument("filename", help="File where the model was saved.") parser.add_argument( "--device", - default="cpu", + default="cuda", help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)", ) + parser.add_argument( + "--fps", + default=60, + help="Playback speed", + ) args = parser.parse_args() env = GymEnvironment(args.env, device=args.device) - load_and_watch(args.dir, env) + load_and_watch(args.filename, env, fps=args.fps) + if __name__ == "__main__": main() diff --git a/scripts/watch_continuous.py b/scripts/watch_continuous.py index 446b8c12..29f68c2d 100644 --- a/scripts/watch_continuous.py +++ b/scripts/watch_continuous.py @@ -1,20 +1,18 @@ # pylint: disable=unused-import import argparse -import pybullet -import pybullet_envs from all.bodies import TimeFeature -from all.environments import GymEnvironment -from all.experiments import GreedyAgent, watch -from continuous import ENVS +from all.environments import GymEnvironment, PybulletEnvironment +from all.experiments import load_and_watch +from .continuous import ENVS def main(): parser = argparse.ArgumentParser(description="Watch a continuous agent.") parser.add_argument("env", help="ID of the Environment") - parser.add_argument("dir", help="Directory where the agent's model was saved.") + parser.add_argument("filename", help="File where the model was saved.") parser.add_argument( "--device", - default="cpu", + default="cuda", help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", ) parser.add_argument( @@ -25,13 +23,14 @@ def main(): args = parser.parse_args() if args.env in ENVS: - env_id = ENVS[args.env] + env = GymEnvironment(args.env, device=args.device) + elif 'BulletEnv' in args.env or args.env in PybulletEnvironment.short_names: + env = PybulletEnvironment(args.env, device=args.device) else: - env_id = args.env + env = GymEnvironment(args.env, device=args.device) + + load_and_watch(args.filename, env, fps=args.fps) - env = GymEnvironment(env_id, device=args.device) - agent = TimeFeature(GreedyAgent.load(args.dir, env)) - watch(agent, env, fps=args.fps) if __name__ == "__main__": main() diff --git a/scripts/watch_multiagent_atari.py b/scripts/watch_multiagent_atari.py new file mode 100644 index 00000000..7f16e536 --- /dev/null +++ b/scripts/watch_multiagent_atari.py @@ -0,0 +1,60 @@ +import argparse +import time +import torch +from all.bodies import DeepmindAtariBody +from all.environments import MultiagentAtariEnv +from all.experiments import load_and_watch + + +def watch(env, filename, fps, reload): + agent = torch.load(filename).test_agent() + + while True: + watch_episode(env, agent, fps) + if reload: + try: + agent = torch.load(filename).test_agent() + except Exception as e: + print('Warning: error reloading model: {}'.format(filename)) + print(e) + + +def watch_episode(env, agent, fps): + env.reset() + for _ in env.agent_iter(): + env.render() + state = env.last() + action = agent.act(state) + if state.done: + env.step(None) + else: + env.step(action) + time.sleep(1 / fps) + + +def main(): + parser = argparse.ArgumentParser(description="Watch pretrained multiagent atari") + parser.add_argument("env", help="Name of the Atari game (e.g. pong-v1)") + parser.add_argument("filename", help="File where the model was saved.") + parser.add_argument( + "--device", + default="cuda", + help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", + ) + parser.add_argument( + "--fps", + default=30, + type=int, + help="Playback speed", + ) + parser.add_argument( + "--reload", + action="store_true", default=False, help="Reload the model from disk after every episode" + ) + args = parser.parse_args() + env = MultiagentAtariEnv(args.env, device=args.device) + watch(env, args.filename, args.fps, args.reload) + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index ebad6f5d..d84f6311 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,44 @@ from setuptools import setup, find_packages + +extras = { + "atari": [ + "atari_py~=0.2.0", # atari environments + "Pillow~=7.1.2", # rendering library + ], + "box2d": [ + "box2d-py~=2.3.5", # box3d physics environments + ], + "pybullet": [ + "pybullet>=3.0.6", # open-source robotics environments + ], + "ma-atari": [ + "PettingZoo[atari]>=1.5.0", # Multiagent atari environments + "supersuit>=2.4.0", # Multiagent env wrappers + "AutoROM>=0.1.19", # Tool for downloading ROMs + ], + "test": [ + "flake8>=3.8", # linter for pep8 compliance + "autopep8>=1.5", # automatically fixes some pep8 errors + "torch-testing>=0.0.2", # pytorch assertion library + ], + "docs": [ + "sphinx>=3.2.1", # documentation library + "sphinx-autobuild>=2020.9.1", # documentation live reload + "sphinx-rtd-theme>=0.5.0", # documentation theme + "sphinx-automodapi>=0.13", # autogenerate docs for modules + ], + "comet": [ + "comet-ml>=3.2.11", # experiment tracking using Comet.ml + ] +} + +extras["all"] = extras["atari"] + extras["box2d"] + extras["pybullet"] + extras["ma-atari"] + extras["comet"] +extras["dev"] = extras["all"] + extras["test"] + extras["docs"] + extras["comet"] + setup( name="autonomous-learning-library", - version="0.6.2", + version="0.7.0", description=("A library for building reinforcement learning agents in Pytorch"), packages=find_packages(), url="https://github.com/cpnota/autonomous-learning-library.git", @@ -22,28 +58,14 @@ ], }, install_requires=[ - "gym[atari,box2d]", # common environments - "numpy", # math library - "matplotlib", # plotting library - "opencv-python>=3.,<4.",# used by atari wrappers - "pybullet", # continuous environments - "tensorboardX", # tensorboard compatibility + "gym~=0.18.0", # common environment interface + "numpy>=1.18.0", # math library + "matplotlib>=3.3.0", # plotting library + "opencv-python~=3.4.0", # used by atari wrappers + "torch~=1.8.0", # core deep learning library + "tensorboard>=2.3.0", # logging and visualization + "tensorboardX>=2.1.0", # tensorboard/pytorch compatibility + "cloudpickle>=1.2.0", # used to copy environments ], - extras_require={ - "pytorch": [ - "torch", # deep learning - "torchvision", # additional utilities - "tensorboard" # visualizations - ], - "docs": [ - "sphinx", - "sphinx-autobuild", - "sphinx-rtd-theme", - "sphinx-automodapi" - ], - "dev": [ - "pylint", # code quality tool - "torch-testing" # pytorch assertion library - ] - }, + extras_require=extras )
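With the extras defined above, typical installs look like the following (the extra names come from ``setup.py``; quote the brackets if your shell expands them):

.. code-block:: bash

    # core library only
    pip install autonomous-learning-library

    # core library plus every optional environment
    pip install "autonomous-learning-library[all]"

    # contributor setup from a clone: all environments plus test and docs dependencies
    pip install -e ".[dev]"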