AI-Hypercomputer
diff --git a/‎src/MaxText/configs/rl.yml‎
Lines changed: 33 additions & 55 deletions b/‎src/MaxText/configs/rl.yml‎
Lines changed: 33 additions & 55 deletions
diff --git a/‎src/MaxText/evaluate_rl.py‎
Lines changed: 18 additions & 33 deletions b/‎src/MaxText/evaluate_rl.py‎
Lines changed: 18 additions & 33 deletions
@@ -61,33 +61,30 @@ loss_algo: 'grpo' # grpo or gspo-token
 # Model-Specific Overrides (examples)
 # For Llama3.1-8B:
 #   model_name: llama3.1-8b
-#   tokenizer_path: meta-llama/Llama-3.1-8B-Instruct
+#   HF tokenizer_path: meta-llama/Llama-3.1-8B-Instruct
 #
 # For Llama3.1-70B with Pathways:
 #   model_name: llama3.1-70b
-#   tokenizer_path: meta-llama/Llama-3.1-70B-Instruct
+#   HF tokenizer_path: meta-llama/Llama-3.1-70B-Instruct
 
-async_checkpointing: 'false'
-checkpoint_period: 5
-skip_jax_distributed_system: True
+# ====== MaxText configs ======
 weight_dtype: 'bfloat16'
 attention: 'dot_product'
 remat_policy: 'custom'
 decoder_layer_input: 'offload'
 query_proj: 'offload'
 key_proj: 'offload'
 value_proj: 'offload'
-# for vLLM
-hf_model_name: 'meta-llama/Llama-3.1-70B-Instruct'
+
 
 # ====== Training ======
 batch_size: 1
 # Increase `batch_size` and `MAX_STEPS` for better results.
-# num_batches = 3738
-num_batches = 4  # 200
+# num_batches: 3738
+num_batches: 4  # 200
 # Keep `num_test_batches` low so that evaluation runs quickly. It can be
 # increased to a max. of 330 (if batch size is 4).
-num_test_batches = 5  # 200
+num_test_batches: 5  # 200
 train_fraction: 1.0
 
 eval_interval: 10  # has no effect when `train_fraction` is 1.0.
@@ -96,6 +93,20 @@ num_epochs: 1  # can potentially train for more epochs
 
 gradient_clipping_threshold: 0.1
 
+# ====== Evaluation ======
+generation_configs:
+  greedy:
+    temperature: 0.01
+    top_k: 1
+    top_p: 1.0
+  standard:
+    temperature: 0.7
+    top_k: 50
+    top_p: 0.95
+  liberal:
+    temperature: 0.85
+    top_k: 2000
+    top_p: 1.0
 
 # greedy
 eval_temperature: 0.01
@@ -108,13 +119,9 @@ eval_top_p: 1.0
 
 
 # ====== Inference ======
-# Important to keep a high-ish temperature for varied, diverse responses during
-# training.
-decode_sampling_temperature: 0.9
-decode_sampling_top_k: 50
-decode_sampling_nucleus_p: 1.0
-
 # for vLLM
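+hf_model_name: None  # NOTE(review): YAML parses a bare None as the string "None", not null; confirm the loader expects that.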
+
 # === Generation during GRPO training ===
 # max Lengths for prompt and completion
 max_prefill_predict_length: 256
@@ -123,13 +130,15 @@ kv_cache_buffer: 256
 hbm_utilization_vllm: 0.72
 swap_space_vllm_gb: 2
 # Generation Configuration During Training
+# Keep the sampling temperature fairly high so that the responses generated
+# during training stay diverse.
 decode_sampling_temperature: 0.9
-decode_sampling_top_p: 1.0
 decode_sampling_top_k: 50
+decode_sampling_nucleus_p: 1.0
 
 # ====== Checkpoint Configuration ======
 enable_checkpointing: True
-async_checkpointing: True
+async_checkpointing: False
 checkpoint_period: 50
 max_num_checkpoints_to_keep: 10
 
@@ -162,40 +171,9 @@ template: |
   <start_of_turn>model
 
 
-# TODO: fix this
-# Dataset Configuration
-dataset_type: hf  # Huggingface input pipeline
-hf_path: 'gsm8k'
-hf_data_split: 'main'
-hf_data_files: 'train'
-
-
-# Pathways Inference Configuration
-# For multi-host/multi-slice setups
-use_pathways_reshard: False
-inference_devices_per_replica: 4
-inference_replicas: 1
-
-# Tokenizer Settings
-add_bos: False
-add_eos: False
-return_log_prob: True
-
-# Performance and Memory
-weight_dtype: bfloat16
-dtype: bfloat16
-
-# Splash Attention Block Sizes
-# Tuned for GRPO workloads
-sa_block_q: 128
-sa_block_kv: 128
-sa_block_kv_compute: 128
-sa_block_q_dkv: 128
-sa_block_kv_dkv: 128
-sa_block_kv_dkv_compute: 128
-sa_block_q_dq: 128
-sa_block_kv_dq: 128
-sa_use_fused_bwd_kernel: False
-sa_q_layout: "HEAD_DIM_MINOR"
-sa_k_layout: "HEAD_DIM_MINOR"
-sa_v_layout: "HEAD_DIM_MINOR"
+# # TODO(@mazumdera): fix this
+# # Dataset Configuration
+# dataset_type: hf  # Huggingface input pipeline
+# hf_path: 'gsm8k'
+# hf_data_split: 'main'
+# hf_data_files: 'train'
@@ -18,6 +18,7 @@
 import os
 from pprint import pprint
 import re
+
 import sys
 
 from datetime import datetime
@@ -51,11 +52,6 @@
 from MaxText import rl_utils
 
 # ## Evaluate
-#
-#
-# Before we train the model, let's evaluate the model on the test set so we can
-# see the improvement post training.
-#
 # We evaluate the model in two ways:
 #
 # **Quantitative**
@@ -76,13 +72,10 @@
 
 
 def generate_responses(
-    mt_config,
+    tmvp_config,
     prompts,
     rl_cluster,
     num_passes=1,
-    temperature=0.7,
-    top_k=50,
-    top_p=0.95,
 ):
   """
   Generate responses for a batch of prompts across multiple passes.
@@ -104,15 +97,15 @@ def generate_responses(
     responses = rl_cluster.rollout.generate(
         prompts,
         rollout_config=RolloutConfig(
-            max_tokens_to_generate=mt_config.max_target_length,
-            temperature=mt_config.eval_temperature,
-            top_k=mt_config.eval_top_k,
-            top_p=mt_config.eval_top_p,
+            max_tokens_to_generate=tmvp_config.max_target_length,
+            temperature=tmvp_config.eval_temperature,
+            top_k=tmvp_config.eval_top_k,
+            top_p=tmvp_config.eval_top_p,
         ),
     )
     responses = responses.text
 
-    if mt_config.debug:
+    if tmvp_config.debug:
       print(f"Pass {p+1}/{num_passes}, responses: {responses}")
 
     for idx, response in enumerate(responses):
@@ -121,7 +114,7 @@ def generate_responses(
   return multiple_call_responses
 
 
-def score_responses(mt_config, question, responses, answer):
+def score_responses(tmvp_config, question, responses, answer):
   """
   Score a set of responses for a single question.
 
@@ -133,10 +126,10 @@ def score_responses(mt_config, question, responses, answer):
   Returns:
       Tuple of (is_correct, is_partially_correct, has_correct_format)
   """
-  match_format = rl_utils.get_match_format_regex(mt_config)
-  match_numbers = rl_utils.get_match_numbers_regex(mt_config)
+  match_format = rl_utils.get_match_format_regex(tmvp_config)
+  match_numbers = rl_utils.get_match_numbers_regex(tmvp_config)
 
-  if DEBUG:
+  if tmvp_config.debug:
     print("========================================")
     print(f"Evaluation Question: {question}")
     print(f"Evaluation Answer: {answer}")
@@ -151,7 +144,7 @@ def score_responses(mt_config, question, responses, answer):
     # Extract numerical response
     extracted_response = guess.group(1) if (guess := match_numbers.search(response)) is not None else "-1000000"
 
-    if DEBUG:
+    if tmvp_config.debug:
       print(f"Evaluation extracted_response: {extracted_response}")
 
     # Check exact correctness
@@ -164,7 +157,7 @@ def score_responses(mt_config, question, responses, answer):
       if 0.9 <= ratio <= 1.1:
         is_partially_correct = True
     except Exception as e:
-      if DEBUG:
+      if tmvp_config.debug:
         print(f"Evaluation Exception: {e}")
         print("SKIPPED")
 
@@ -180,12 +173,9 @@ def score_responses(mt_config, question, responses, answer):
 
 
 def evaluate(
-    mt_config,
+    tmvp_config,
     dataset,
     rl_cluster,
-    temperature=0.7,
-    top_k=50,
-    top_p=0.95,
     num_passes=1,
     corr_lst=False,
     make_lst=False,
@@ -194,11 +184,9 @@ def evaluate(
   Computes accuracy and percentage of outputs matching the format.
 
   Args:
+      tmvp_config: Configuration object passed through to generate_responses and score_responses
       dataset: The evaluation dataset
-      rl_cluster: Model cluster for generation
-      temperature: Sampling temperature
-      top_k: Top-k sampling parameter
-      top_p: Top-p sampling parameter
+      rl_cluster: Model cluster for generation
       num_passes: Number of generation passes
       corr_lst: If True, only include correct responses in the list
       make_lst: If True, return a list of (question, answer, responses)
@@ -219,19 +207,16 @@ def evaluate(
 
     # Generate responses for all prompts in the batch
     multiple_call_responses = generate_responses(
-        mt_config=mt_config,
+        tmvp_config=tmvp_config,
         prompts=prompts,
         rl_cluster=rl_cluster,
         num_passes=num_passes,
-        temperature=temperature,
-        top_k=top_k,
-        top_p=top_p,
     )
 
     # Score each question-answer pair
     for question, responses, answer in zip(questions, multiple_call_responses, answers):
       is_correct, is_partially_correct, has_correct_format = score_responses(
-          mt_config=mt_config,
+          tmvp_config=tmvp_config,
           question=question,
           responses=responses,
           answer=answer,