diff --git a/examples/scripts/sft_nemotron_3.py b/examples/scripts/sft_nemotron_3.py
index c3f6aa0c938..8fbf6671636 100644
--- a/examples/scripts/sft_nemotron_3.py
+++ b/examples/scripts/sft_nemotron_3.py
@@ -15,7 +15,7 @@
 # /// script
 # dependencies = [
 #     "trl[peft,quantization]",
-#     "transformers>=5.3.0",
+#     "transformers>=5.7.0",
 #     "trackio",
 #     "mamba_ssm==2.2.5",
 #     "causal_conv1d==1.5.2",
@@ -27,7 +27,7 @@
 Prerequisites:
-    pip install "transformers>=5.3.0"
+    pip install "transformers>=5.7.0"
     pip install --no-build-isolation mamba_ssm==2.2.5
     pip install --no-build-isolation causal_conv1d==1.5.2
@@ -62,9 +62,6 @@ def main(script_args, training_args, model_args):
-    # NemotronH does not support gradient checkpointing
-    training_args.gradient_checkpointing = False
-
     # Load model
     model_kwargs = dict(
         revision=model_args.model_revision,
diff --git a/scripts/generate_tiny_models.py b/scripts/generate_tiny_models.py
index 9090fd346af..5091b7127bb 100644
--- a/scripts/generate_tiny_models.py
+++ b/scripts/generate_tiny_models.py
@@ -60,6 +60,8 @@
     LlavaNextForConditionalGeneration,
     MistralConfig,
     MistralForCausalLM,
+    NemotronHConfig,
+    NemotronHForCausalLM,
     OPTConfig,
     OPTForCausalLM,
     PaliGemmaForConditionalGeneration,
@@ -233,6 +235,37 @@ def init_weights_tiny_model(model):
     init_weights_tiny_model(model)
     push_to_hub(model, tokenizer, generation_config, "tiny", suffix)
 
+# Hybrid Mamba-Attention models
+tokenizer = AutoTokenizer.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16")
+generation_config = GenerationConfig.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16")
+config = NemotronHConfig(
+    vocab_size=len(tokenizer.vocab),
+    hidden_size=16,
+    num_attention_heads=4,
+    num_key_value_heads=2,
+    intermediate_size=32,
+    layers_block_type=["mamba", "attention"],  # 2 layers: one Mamba + one Attention
+    mamba_num_heads=8,
+    mamba_head_dim=4,
+    mamba_n_groups=1,
+    ssm_state_size=16,
+    mamba_d_conv=4,
+    mamba_expand=2,
+    n_routed_experts=4,
+    num_experts_per_tok=2,
+    moe_intermediate_size=32,
+    moe_shared_expert_intermediate_size=32,
+    use_mamba_kernels=False,  # CPU-friendly for testing
+)
+model = NemotronHForCausalLM(config).to(dtype=torch.bfloat16)
+init_weights_tiny_model(model)
+# NemotronH keeps mixer.D and mixer.A_log in float32 in the reference model; mirror that here.
+# Layer 0 is the Mamba layer per layers_block_type above.
+mamba_layer = model.model.layers[0]
+mamba_layer.mixer.D.data = mamba_layer.mixer.D.data.float()
+mamba_layer.mixer.A_log.data = mamba_layer.mixer.A_log.data.float()
+push_to_hub(model, tokenizer, generation_config, "tiny")
+
 # Two slightly bigger models, required for vLLM testing
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
 generation_config = GenerationConfig.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py
index 3498a8f1850..accd47ef750 100644
--- a/tests/test_data_utils.py
+++ b/tests/test_data_utils.py
@@ -542,6 +542,13 @@ class TestApplyChatTemplate(TrlTestCase):
         "trl-internal-testing/tiny-LlamaForCausalLM-3",
         "trl-internal-testing/tiny-MistralForCausalLM-0.1",
         "trl-internal-testing/tiny-MistralForCausalLM-0.2",
+        pytest.param(
+            "trl-internal-testing/tiny-NemotronHForCausalLM",
+            marks=pytest.mark.skipif(
+                Version(transformers.__version__) < Version("5.7.0"),
+                reason="NemotronH gradient checkpointing requires transformers>=5.7.0 (see transformers#45625)",
+            ),
+        ),
         "trl-internal-testing/tiny-Phi3ForCausalLM-3",
         "trl-internal-testing/tiny-Phi3ForCausalLM-3.5",
         "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index 5000c0f449a..238c37145b3 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 import pytest
 import torch
 import transformers
@@ -171,6 +172,13 @@ class TestDPOTrainer(TrlTestCase):
         "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
         "trl-internal-testing/tiny-Qwen3MoeForCausalLM",
         "trl-internal-testing/tiny-GptOssForCausalLM",
+        pytest.param(
+            "trl-internal-testing/tiny-NemotronHForCausalLM",
+            marks=pytest.mark.skipif(
+                Version(transformers.__version__) < Version("5.7.0"),
+                reason="NemotronH gradient checkpointing requires transformers>=5.7.0 (see transformers#45625)",
+            ),
+        ),
     ],
 )
 def test_train(self, model_id):
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index 2c10d2b7e94..00ce9021174 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -358,6 +358,13 @@ def test_init_with_training_arguments(self):
         "trl-internal-testing/tiny-GptOssForCausalLM",
         "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
         "trl-internal-testing/tiny-Qwen3MoeForCausalLM",
+        pytest.param(
+            "trl-internal-testing/tiny-NemotronHForCausalLM",
+            marks=pytest.mark.skipif(
+                Version(transformers.__version__) < Version("5.7.0"),
+                reason="NemotronH gradient checkpointing requires transformers>=5.7.0 (see transformers#45625)",
+            ),
+        ),
     ],
 )
 def test_train(self, model_id):
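
Reviewer note (not part of the patch): a minimal smoke-test sketch of what the new coverage exercises. It assumes transformers>=5.7.0 is installed and that the tiny model has already been pushed to the Hub by scripts/generate_tiny_models.py; the prompt string is purely illustrative.

    # Hypothetical local check: NemotronH forward/backward with gradient
    # checkpointing enabled, the case that previously forced
    # training_args.gradient_checkpointing = False in sft_nemotron_3.py.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "trl-internal-testing/tiny-NemotronHForCausalLM"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.train()
    model.gradient_checkpointing_enable()  # needs transformers>=5.7.0 for NemotronH

    inputs = tokenizer("gradient checkpointing smoke test", return_tensors="pt")
    loss = model(**inputs, labels=inputs["input_ids"]).loss
    loss.backward()  # should complete without errors once checkpointing is supported
    print(f"loss: {loss.item():.4f}")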