From 7ace59ed29d8533876936c76cfc18c307ad0aa07 Mon Sep 17 00:00:00 2001
From: Ryan Strauss
Date: Fri, 29 Dec 2023 16:00:16 -0500
Subject: [PATCH] Update documentation

---
 README.md                          | 13 +++++++++++--
 dopamax/agents/anakin/alphazero.py | 20 ++++++++++++++++++++
 dopamax/environments/pgx/base.py   | 17 +++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 567a795..a4dcedb 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 Dopamax is a library containing pure [JAX][1] implementations of common reinforcement learning algorithms. _Everything_
 is implemented in JAX, including the environments. This allows for extremely fast training and evaluation of agents,
 because the entire loop of environment simulation, agent interaction, and policy updates can be compiled as a single
-XLA program and executed on CPUs, GPUs, or TPUs. More specifically, rhe implementations in Dopamax follow the
+XLA program and executed on CPUs, GPUs, or TPUs. More specifically, the implementations in Dopamax follow the
 Anakin Podracer architecture -- see [this paper][2] for more details.
 
 **Note that this repository is not actively maintained and is subject to breaking changes at any time.**
@@ -27,6 +27,7 @@ Anakin Podracer architecture -- see [this paper][2] for more details.
 
 - [Proximal Policy Optimization (PPO)](dopamax/agents/anakin/ppo.py)
 - [Deep Q-Network (DQN)](dopamax/agents/anakin/dqn.py)
+- [AlphaZero](dopamax/agents/anakin/alphazero.py)
 
 ## Installation
 
@@ -83,4 +84,12 @@ where `--num_episodes 100` signals that you would like to rollout the agent's po
 mean, and maximum episode reward will be logged back to W&B. If you would additionally like to render the episodes and
 have them logged back to W&B, you can provide the `--render` flag. But note that this will usually significantly slow
 down the evaluation process since environment rendering is not a pure JAX function and requires callbacks to the host.
-You should usually only use the `--render` flag with a small number of episodes.
\ No newline at end of file
+You should usually only use the `--render` flag with a small number of episodes.
+
+## See Also
+
+Some of the JAX-native packages that Dopamax relies on:
+- [sotetsuk/pgx](https://github.com/sotetsuk/pgx)
+- [deepmind/mctx](https://github.com/deepmind/mctx)
+- [deepmind/rlax](https://github.com/deepmind/rlax)
+- [google/brax](https://github.com/google/brax)
diff --git a/dopamax/agents/anakin/alphazero.py b/dopamax/agents/anakin/alphazero.py
index 0befbd7..7ab79d5 100644
--- a/dopamax/agents/anakin/alphazero.py
+++ b/dopamax/agents/anakin/alphazero.py
@@ -51,6 +51,26 @@
 
 
 @register("AlphaZero")
 class AlphaZero(AnakinAgent):
+    """AlphaZero agent.
+
+    Note that this implementation is slightly modified from the original version of the algorithm so as to adhere
+    to the Anakin architecture. It also uses a more modern version of Monte Carlo Tree Search, as implemented by
+    `mctx.muzero_policy`.
+
+    Args:
+        env: The environment to interact with. This should be a subclass of `PGXEnvironment`.
+        config: The configuration dictionary for the agent.
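+
+    Example:
+        A minimal sketch of constructing the agent, assuming `env` is an existing
+        `PGXEnvironment` instance and `config` is a `ConfigDict` of agent
+        hyperparameters (illustrative only, not a full training entry point):
+
+            agent = AlphaZero(env, config)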
+
+    References:
+        https://arxiv.org/abs/1712.01815
+    """
     def __init__(self, env: Environment, config: ConfigDict):
         super().__init__(env, config)
diff --git a/dopamax/environments/pgx/base.py b/dopamax/environments/pgx/base.py
index 5a2f2e6..f21272a 100644
--- a/dopamax/environments/pgx/base.py
+++ b/dopamax/environments/pgx/base.py
@@ -21,6 +21,23 @@ class PGXEnvState(EnvState):
 
 @dataclass(frozen=True)
 class PGXEnvironment(Environment, ABC):
+    """Abstract base class for PGX environments.
+
+    PGX is a collection of JAX-native implementations of discrete state space environments like Chess, Shogi, and Go.
+    This class serves as a wrapper around PGX environments in order to make them conform to the dopamax environment
+    API.
+
+    References:
+        https://github.com/sotetsuk/pgx
+    """
     _pgx_env: pgx.Env
 
     def reset(self, key: PRNGKey) -> Tuple[TimeStep, PGXEnvState]:
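+        """Resets the environment to an initial state.
+
+        Args:
+            key: A PRNG key used to generate the environment's initial state.
+
+        Returns:
+            A tuple containing the initial `TimeStep` and the corresponding `PGXEnvState`.
+        """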