7 changes: 2 additions & 5 deletions examples/scripts/sft_nemotron_3.py
@@ -15,7 +15,7 @@
# /// script
# dependencies = [
# "trl[peft,quantization]",
# "transformers>=5.3.0",
# "transformers>=5.7.0",
# "trackio",
# "mamba_ssm==2.2.5",
# "causal_conv1d==1.5.2",
@@ -27,7 +27,7 @@

Prerequisites:

pip install "transformers>=5.3.0"
pip install "transformers>=5.7.0"
pip install --no-build-isolation mamba_ssm==2.2.5
pip install --no-build-isolation causal_conv1d==1.5.2

@@ -62,9 +62,6 @@


def main(script_args, training_args, model_args):
-# NemotronH does not support gradient checkpointing
-training_args.gradient_checkpointing = False

# Load model
model_kwargs = dict(
revision=model_args.model_revision,
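With transformers>=5.7.0, NemotronH supports gradient checkpointing, so the script no longer needs to force-disable it. A minimal sketch of the now-standard path, not part of the diff (the tiny checkpoint name is the one used by the tests below; any NemotronH model works):

```python
# Minimal sketch: gradient checkpointing on a NemotronH model once transformers>=5.7.0 is installed.
import transformers
from packaging.version import Version
from transformers import AutoModelForCausalLM

assert Version(transformers.__version__) >= Version("5.7.0"), (
    "NemotronH gradient checkpointing requires transformers>=5.7.0"
)

# The tiny test model from this PR keeps the example light; swap in any NemotronH checkpoint.
model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-NemotronHForCausalLM")
model.gradient_checkpointing_enable()  # previously force-disabled in sft_nemotron_3.py
```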
33 changes: 33 additions & 0 deletions scripts/generate_tiny_models.py
@@ -60,6 +60,8 @@
LlavaNextForConditionalGeneration,
MistralConfig,
MistralForCausalLM,
+NemotronHConfig,
+NemotronHForCausalLM,
OPTConfig,
OPTForCausalLM,
PaliGemmaForConditionalGeneration,
@@ -233,6 +235,37 @@ def init_weights_tiny_model(model):
init_weights_tiny_model(model)
push_to_hub(model, tokenizer, generation_config, "tiny", suffix)

+# Hybrid Mamba-Attention models
+tokenizer = AutoTokenizer.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16")
+generation_config = GenerationConfig.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16")
+config = NemotronHConfig(
+    vocab_size=len(tokenizer.vocab),
+    hidden_size=16,
+    num_attention_heads=4,
+    num_key_value_heads=2,
+    intermediate_size=32,
+    layers_block_type=["mamba", "attention"],  # 2 layers: one Mamba + one Attention
+    mamba_num_heads=8,
+    mamba_head_dim=4,
+    mamba_n_groups=1,
+    ssm_state_size=16,
+    mamba_d_conv=4,
+    mamba_expand=2,
+    n_routed_experts=4,
+    num_experts_per_tok=2,
+    moe_intermediate_size=32,
+    moe_shared_expert_intermediate_size=32,
+    use_mamba_kernels=False,  # CPU-friendly for testing
+)
+model = NemotronHForCausalLM(config).to(dtype=torch.bfloat16)
+init_weights_tiny_model(model)
+# NemotronH keeps mixer.D and mixer.A_log in float32 in the reference model; mirror that here.
+# Layer 0 is the Mamba layer per layers_block_type above.
+mamba_layer = model.model.layers[0]
+mamba_layer.mixer.D.data = mamba_layer.mixer.D.data.float()
+mamba_layer.mixer.A_log.data = mamba_layer.mixer.A_log.data.float()
+push_to_hub(model, tokenizer, generation_config, "tiny")

# Two slightly bigger models, required for vLLM testing
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
generation_config = GenerationConfig.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
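Before pushing, the tiny hybrid config can be smoke-tested locally with a short forward pass. A hedged sketch, not part of the diff, reusing the same config values (vocab_size is a placeholder here; the script uses len(tokenizer.vocab)) and assuming the pure-PyTorch Mamba path runs on CPU:

```python
# Hypothetical smoke test for the tiny hybrid Mamba-Attention model defined above.
# Kept in float32 here; the script itself casts to bfloat16 before pushing.
import torch
from transformers import NemotronHConfig, NemotronHForCausalLM

config = NemotronHConfig(
    vocab_size=32,  # placeholder; the script uses len(tokenizer.vocab)
    hidden_size=16,
    num_attention_heads=4,
    num_key_value_heads=2,
    intermediate_size=32,
    layers_block_type=["mamba", "attention"],
    mamba_num_heads=8,
    mamba_head_dim=4,
    mamba_n_groups=1,
    ssm_state_size=16,
    mamba_d_conv=4,
    mamba_expand=2,
    n_routed_experts=4,
    num_experts_per_tok=2,
    moe_intermediate_size=32,
    moe_shared_expert_intermediate_size=32,
    use_mamba_kernels=False,  # pure-PyTorch SSM path, no mamba_ssm/causal_conv1d needed
)
model = NemotronHForCausalLM(config)

input_ids = torch.randint(0, config.vocab_size, (1, 8))
out = model(input_ids=input_ids, labels=input_ids)
print(out.loss)  # a finite loss indicates the Mamba, attention, and MoE blocks wire together
```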
7 changes: 7 additions & 0 deletions tests/test_data_utils.py
@@ -542,6 +542,13 @@ class TestApplyChatTemplate(TrlTestCase):
"trl-internal-testing/tiny-LlamaForCausalLM-3",
"trl-internal-testing/tiny-MistralForCausalLM-0.1",
"trl-internal-testing/tiny-MistralForCausalLM-0.2",
+pytest.param(
+    "trl-internal-testing/tiny-NemotronHForCausalLM",
+    marks=pytest.mark.skipif(
+        Version(transformers.__version__) < Version("5.7.0"),
+        reason="NemotronH gradient checkpointing requires transformers>=5.7.0 (see transformers#45625)",
+    ),
+),
"trl-internal-testing/tiny-Phi3ForCausalLM-3",
"trl-internal-testing/tiny-Phi3ForCausalLM-3.5",
"trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
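The version-gated parameter above relies on `Version` (from `packaging.version`) and `transformers` already being imported in the test module. A self-contained sketch of the same gating pattern, with a hypothetical test body for illustration:

```python
# Sketch of the skipif-gated parametrization used in the tests above.
import pytest
import transformers
from packaging.version import Version

NEMOTRON_H_PARAM = pytest.param(
    "trl-internal-testing/tiny-NemotronHForCausalLM",
    marks=pytest.mark.skipif(
        Version(transformers.__version__) < Version("5.7.0"),
        reason="NemotronH gradient checkpointing requires transformers>=5.7.0 (see transformers#45625)",
    ),
)


@pytest.mark.parametrize(
    "model_id",
    ["trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", NEMOTRON_H_PARAM],
)
def test_model_id_is_resolvable(model_id):  # hypothetical test body, for illustration only
    assert model_id.startswith("trl-internal-testing/")
```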
8 changes: 8 additions & 0 deletions tests/test_dpo_trainer.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.


import pytest
import torch
import transformers
@@ -171,6 +172,13 @@ class TestDPOTrainer(TrlTestCase):
"trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
"trl-internal-testing/tiny-Qwen3MoeForCausalLM",
"trl-internal-testing/tiny-GptOssForCausalLM",
+pytest.param(
+    "trl-internal-testing/tiny-NemotronHForCausalLM",
+    marks=pytest.mark.skipif(
+        Version(transformers.__version__) < Version("5.7.0"),
+        reason="NemotronH gradient checkpointing requires transformers>=5.7.0 (see transformers#45625)",
+    ),
+),
],
)
def test_train(self, model_id):
7 changes: 7 additions & 0 deletions tests/test_sft_trainer.py
@@ -358,6 +358,13 @@ def test_init_with_training_arguments(self):
"trl-internal-testing/tiny-GptOssForCausalLM",
"trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
"trl-internal-testing/tiny-Qwen3MoeForCausalLM",
+pytest.param(
+    "trl-internal-testing/tiny-NemotronHForCausalLM",
+    marks=pytest.mark.skipif(
+        Version(transformers.__version__) < Version("5.7.0"),
+        reason="NemotronH gradient checkpointing requires transformers>=5.7.0 (see transformers#45625)",
+    ),
+),
],
)
def test_train(self, model_id):