|
12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
|
15 | | -# GRPO Configuration |
16 | | -# This config consolidates common parameters for GRPO training across different model sizes |
| 15 | +# RL Configuration |
| 16 | +# This config consolidates common parameters for RL training across different model sizes |
17 | 17 |
|
18 | 18 | base_config: "base.yml" |
19 | 19 |
|
20 | | -use_grpo: True |
21 | | -train_data_columns: 'prompt' |
| 20 | +# ====== Hardware ===== |
| 21 | +trainer_devices_fraction: 0.5 |
| 22 | +sampler_devices_fraction: 0.5 |
| 23 | +chips_per_vm: 4 # depends on hardware, for v5p this is 4 |
22 | 24 |
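For intuition, a minimal sketch (not the repo's actual mesh setup) of how the 50/50 trainer/sampler split implied by the two fractions above could be derived from the visible JAX devices:

```python
import jax

# Split the visible devices between the trainer and the rollout sampler
# according to trainer_devices_fraction / sampler_devices_fraction (0.5 each).
devices = jax.devices()
num_trainer = int(len(devices) * 0.5)    # trainer_devices_fraction
trainer_devices = devices[:num_trainer]
sampler_devices = devices[num_trainer:]  # the remaining fraction serves the sampler
```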
|
| 25 | +# ====== Debug ====== |
| 26 | +debug: True |
| 27 | + |
| 28 | +# ====== Reproducibility ====== |
| 29 | +data_shuffle_seed: 42 |
| 30 | +loss_algo: 'grpo' # grpo or gspo-token |
| 31 | + |
| 32 | +# ====== Checkpoint saving ====== |
| 33 | +save_interval_steps: 500 |
| 34 | +max_to_keep: 4 |
| 35 | + |
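For illustration, assuming an Orbax-style checkpoint manager (an assumption; this config does not name the checkpointing backend), the two knobs above map onto it roughly as follows:

```python
import orbax.checkpoint as ocp

# Hypothetical wiring of the checkpoint-saving knobs above.
options = ocp.CheckpointManagerOptions(
    save_interval_steps=500,  # save_interval_steps
    max_to_keep=4,            # max_to_keep
)
manager = ocp.CheckpointManager("/tmp/rl_checkpoints", options=options)  # directory is illustrative
```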
| 36 | +# ====== GRPO ====== |
| 37 | +# === Generation during GRPO training === |
| 38 | +max_prompt_length: 256 |
| 39 | +total_generation_steps: 768 |
| 40 | + |
| 41 | +# The number of responses the policy generates for each prompt within a single |
| 42 | +# training step. This corresponds to `G` in Algorithm 1 of the GRPO paper; the |
| 43 | +# "group" in GRPO refers to this set of responses. |
| 44 | +num_generations: 2 |
| 45 | + |
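For reference, a minimal NumPy sketch (not this repo's implementation) of how the `num_generations` responses per prompt are typically turned into group-relative advantages in GRPO, i.e. each response's reward normalized by its group's mean and standard deviation:

```python
import numpy as np

def group_relative_advantages(rewards: np.ndarray, num_generations: int) -> np.ndarray:
    """rewards has shape [num_prompts * num_generations], grouped per prompt."""
    grouped = rewards.reshape(-1, num_generations)     # [num_prompts, G]
    mean = grouped.mean(axis=1, keepdims=True)
    std = grouped.std(axis=1, keepdims=True) + 1e-6    # avoid division by zero
    return ((grouped - mean) / std).reshape(-1)

# One prompt, num_generations=2: the better response gets a positive advantage.
print(group_relative_advantages(np.array([3.0, -1.0]), num_generations=2))
```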
| 46 | +# === other GRPO configs === |
| 47 | +# The number of optimization iterations per batch (μ in Algorithm 1 of the GRPO paper). |
| 48 | +num_iterations: 1 |
| 49 | + |
| 50 | +# The coefficient for the KL divergence penalty (β) in the GRPO loss function. |
| 51 | +# It is important to keep this high enough; otherwise the KL divergence can |
| 52 | +# grow unchecked. |
| 53 | +beta: 0.08 |
| 54 | +# Epsilon value for clipping (ε in the GRPO loss in the paper). As in PPO, |
| 55 | +# this keeps policy updates stable. |
| 56 | +epsilon: 0.2 |
| 57 | + |
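As a reminder of where `beta` (β) and `epsilon` (ε) enter the objective, here is a simplified per-token sketch of the GRPO loss (schematic only; the actual training code may differ in details such as masking and aggregation):

```python
import jax.numpy as jnp

def grpo_loss(logp_new, logp_old, logp_ref, advantages, beta=0.08, epsilon=0.2):
    """All arguments are per-token log-probabilities / advantages of the same shape."""
    ratio = jnp.exp(logp_new - logp_old)                     # importance ratio
    clipped = jnp.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)  # PPO-style clipping (epsilon)
    surrogate = jnp.minimum(ratio * advantages, clipped * advantages)
    # k3-style estimator of the KL divergence to the reference policy, as in the GRPO paper.
    kl = jnp.exp(logp_ref - logp_new) - (logp_ref - logp_new) - 1.0
    return -jnp.mean(surrogate - beta * kl)                  # beta weights the KL penalty
```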
| 58 | +# ====== Training ====== |
| 59 | + |
| 60 | +batch_size: 1 |
| 61 | +# Increase `batch_size` and the total number of training steps for better results. |
| 62 | +# num_batches: 3738 |
| 63 | +num_batches: 4 # 200 |
| 64 | +# Keep `num_test_batches` low so that evaluation runs quickly. It can be |
| 65 | +# increased to a maximum of 330 (if the batch size is 4). |
| 66 | +num_test_batches: 5 # 200 |
| 67 | + |
| 68 | +eval_every_n_steps: 10 # has no effect if train_fraction is 1.0 |
| 69 | +num_epochs: 1 # can potentially train for more epochs |
| 70 | + |
| 71 | + |
| 72 | +# === AdamW, warmup, cosine scheduler === |
| 73 | +learning_rate: 3e-6 |
| 74 | +b1: 0.9 |
| 75 | +b2: 0.99 |
| 76 | +weight_decay: 0.1 |
| 77 | +# == Cosine decay with warmup scheduler == |
| 78 | +# Linearly increase the learning rate from 0 to the peak learning rate (3e-6) |
| 79 | +# over the first 10% of training steps, then gradually decay it to 0 with a |
| 80 | +# cosine schedule. |
| 81 | +warmup_steps_fraction: 0.1 |
| 82 | +# == Grad clipping == |
| 83 | +# Gradient clipping to prevent large gradients; found to be important for |
| 84 | +# keeping the KL divergence in check. |
| 85 | +max_grad_norm: 0.1 |
| 86 | + |
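Assuming an optax-based optimizer (a sketch only, with `steps: 100` from further down taken as the total step count; the actual key names used by the training code may differ), the pieces above fit together roughly as:

```python
import optax

total_steps = 100  # `steps` below
schedule = optax.warmup_cosine_decay_schedule(
    init_value=0.0,
    peak_value=3e-6,                      # learning_rate
    warmup_steps=int(0.1 * total_steps),  # warmup_steps_fraction
    decay_steps=total_steps,
    end_value=0.0,
)
optimizer = optax.chain(
    optax.clip_by_global_norm(0.1),       # max_grad_norm
    optax.adamw(learning_rate=schedule, b1=0.9, b2=0.99, weight_decay=0.1),
)
```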
| 87 | + |
| 88 | +# ====== Inference ====== |
| 89 | +# A reasonably high temperature is usually important for varied, diverse |
| 90 | +# responses during training. The values below configure near-greedy decoding; |
| 91 | +# higher-temperature presets are listed (commented out) further down. |
| 92 | +temperature: 0.01 |
| 93 | +top_p: 1.0 |
| 94 | +top_k: 1 |
| 95 | + |
| 96 | +# Some randomness ("standard" preset): |
| 97 | +# temperature: 0.7, top_k: 50, top_p: 0.95 |
| 98 | +# More liberal preset: |
| 99 | +# temperature: 0.85, top_k: 2000, top_p: 1.0 |
| 100 | + |
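To make the interplay of the three knobs concrete, here is a toy next-token sampler (illustrative only, not the vLLM implementation): temperature scales the logits, top-k keeps the k most likely tokens, and top-p keeps the smallest nucleus whose probability mass reaches p. With `temperature: 0.01` and `top_k: 1`, decoding is effectively greedy.

```python
import numpy as np

def sample_token(logits, temperature=0.01, top_k=1, top_p=1.0, rng=None):
    rng = rng or np.random.default_rng()
    scaled = np.asarray(logits, dtype=np.float64) / max(temperature, 1e-6)
    probs = np.exp(scaled - scaled.max())
    probs /= probs.sum()
    # top-k: zero out everything outside the k most likely tokens.
    if 0 < top_k < len(probs):
        probs[np.argsort(probs)[:-top_k]] = 0.0
    # top-p (nucleus): keep the smallest prefix of the sorted distribution
    # whose cumulative mass reaches top_p.
    order = np.argsort(-probs)
    cumulative = np.cumsum(probs[order]) / probs.sum()
    probs[order[np.searchsorted(cumulative, top_p) + 1:]] = 0.0
    probs /= probs.sum()
    return rng.choice(len(probs), p=probs)
```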
| 101 | +# trainer_devices_fraction and sampler_devices_fraction are set in the |
| 102 | +# Hardware section at the top of this config. |
| 103 | +hbm_utilization_vllm: 0.72 |
| 104 | +swap_space_vllm_gb: 2 |
| 105 | + |
| 106 | + |
| 107 | +# ====== Reward ====== |
| 108 | +reward_exact_format_match: 3.0 |
| 109 | +reward_white_space_format_match: 1.5 |
| 110 | +reward_partial_format_match: 0.5 |
| 111 | +reward_ratio_guess_to_answer_high: 0.5 |
| 112 | +reward_ratio_guess_to_answer_low: 0.25 |
| 113 | +penalty_incorrect_format: -0.5 |
| 114 | +penalty_incorrect_answer: -1.0 |
| 115 | + |
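The constants above suggest a format-plus-correctness shaped reward. The sketch below is hypothetical (the tag names, regexes, and scoring rules are assumptions, not this repo's reward code); it mirrors the config values and ignores the two ratio-based partial-credit constants for brevity:

```python
import re

# Mirrors the config values above (hypothetical usage).
REWARD_EXACT_FORMAT_MATCH = 3.0
REWARD_PARTIAL_FORMAT_MATCH = 0.5
PENALTY_INCORRECT_FORMAT = -0.5
PENALTY_INCORRECT_ANSWER = -1.0

def reward(response: str, gold_answer: str) -> float:
    score = 0.0
    exact = re.fullmatch(r"(?s)\s*<reasoning>.*?</reasoning>\s*<answer>.*?</answer>\s*", response)
    answer = re.search(r"(?s)<answer>(.*?)</answer>", response)
    if exact:
        score += REWARD_EXACT_FORMAT_MATCH
    elif answer:
        score += REWARD_PARTIAL_FORMAT_MATCH
    else:
        score += PENALTY_INCORRECT_FORMAT
    guess = answer.group(1).strip() if answer else ""
    if guess != gold_answer.strip():
        score += PENALTY_INCORRECT_ANSWER
    return score

print(reward("<reasoning>2+2</reasoning><answer>4</answer>", "4"))  # 3.0
```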
| 116 | + |
| 117 | +# TODO: fix this |
23 | 118 | # Dataset Configuration |
24 | 119 | dataset_type: hf # Huggingface input pipeline |
25 | 120 | hf_path: 'gsm8k' |
@@ -56,8 +151,6 @@ decode_sampling_top_k: 50 |
56 | 151 | # Training Loop Configuration |
57 | 152 | steps: 100 |
58 | 153 | per_device_batch_size: 1 |
59 | | -eval_interval: 10 |
60 | | -eval_steps: 5 |
61 | 154 |
|
62 | 155 | # Checkpoint Configuration |
63 | 156 | enable_checkpointing: True |
|