Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 71 additions & 13 deletions 3.test_cases/megatron/nemo/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
FROM nvcr.io/nvidia/nemo:25.07.00
ARG GDRCOPY_VERSION=v2.5
ARG EFA_INSTALLER_VERSION=1.47.0
# ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws # OFI NCCL already packaged into EFA installation (/opt/amazon/ofi-nccl) cf. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-changelog.html
ARG NCCL_VERSION=v2.27.7-1
ARG NCCL_TESTS_VERSION=v2.16.9
ARG TRANSFORMERS_VERSION=4.56.1
FROM nvcr.io/nvidia/nemo:26.02
ARG GDRCOPY_VERSION=v2.5.2
ARG EFA_INSTALLER_VERSION=1.48.0
# AWS OFI NCCL is bundled into the EFA installation (/opt/amazon/ofi-nccl) for
# EFA installer >=1.47.0. Cf. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-changelog.html
ARG AWS_OFI_NCCL_VERSION=v1.19.0
ARG NCCL_VERSION=v2.30.4-1
ARG NCCL_TESTS_VERSION=v2.18.3
ARG TRANSFORMERS_VERSION=4.57.6
# Pin megatron-core to the version that NeMo 2.7.x is API-compatible with.
# nemo:26.02 ships megatron-core 0.16.1 at /opt/Megatron-Bridge/3rdparty/Megatron-LM/
# but bundled NeMo 2.7.1 calls APIs 0.16.x removed (get_megatron_optimizer
# kwargs no_weight_decay_cond/scale_lr_cond/lr_mult) and imports submodules
# 0.16.x dropped (megatron.core.dist_checkpointing.strategies.tensorstore).
# 0.15.3 is the latest 0.15.x release on PyPI / GitHub.
ARG MEGATRON_CORE_VERSION=core_v0.15.3


ARG OPEN_MPI_PATH=/opt/amazon/openmpi # Open MPI already packaged into EFA installation (/opt/amazon/openmpi)
Expand Down Expand Up @@ -38,8 +47,13 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \
libtool \
openssh-client \
openssh-server \
vim \
&& apt autoremove -y
vim
# NOTE: deliberately no `apt autoremove` here. The nemo:26.02 base image
# ships several CUDA add-on libraries (libcusparseLt0, libcudnn-frontend,
# etc.) that apt treats as orphaned because no installed Debian package
# declares a `Depends:` on them — they are loaded via dlopen by torch /
# transformer_engine at runtime. autoremove deletes them, and `import torch`
# then crashes with `libcusparseLt.so.0: cannot open shared object file`.

RUN mkdir -p /var/run/sshd && \
sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
Expand All @@ -53,8 +67,37 @@ RUN rm -rf /root/.ssh/ \
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
# NGC images install the OFI NCCL plugin via the libnccl-ofi-ngc-v2 package
# from the EFA installer, which lands at /opt/amazon/aws-ofi-nccl/ rather than
# /opt/amazon/ofi-nccl/ used on stock Ubuntu. Cover both for portability.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/aws-ofi-nccl/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH
# Prepend /opt/venv/bin so every `python`/`pip` resolves to the uv-managed
# venv where nemo, megatron, nemo_run, etc. live. The nemo:26.02 image's
# default PATH puts /usr/bin BEFORE /opt/venv/bin, which breaks torchelastic
# worker spawn: `ft_launcher` finds /usr/bin/python (no nemo_run) instead of
# /opt/venv/bin/python and crashes with `ModuleNotFoundError: nemo_run`.
ENV PATH=/opt/venv/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

# nemo:26.02 ships libcusparseLt0 (libcusparselt0-cuda-13) into the nested
# /usr/lib/x86_64-linux-gnu/libcusparseLt/13/ directory but does NOT create
# the libcusparseLt.so.0 SONAME symlink and does NOT add an ld.so.conf.d
# entry. torch dlopens libcusparseLt.so.0 at import time and crashes:
# ImportError: libcusparseLt.so.0: cannot open shared object file
# Register the dir with ld.so AND let ldconfig create the SONAME symlink.
# Write the ld.so.conf.d drop-in for the nested libcusparseLt directory, then
# run ldconfig so the directory is registered with the dynamic linker.
RUN printf '%s\n' /usr/lib/x86_64-linux-gnu/libcusparseLt/13 \
        > /etc/ld.so.conf.d/000_libcusparselt.conf \
    && ldconfig

# Pin the bundled megatron-core to MEGATRON_CORE_VERSION (see ARG above for
# rationale). Replace the in-place 0.16.1 source tree at
# /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ with the 0.15.3
# release. Pip's site-packages copy is updated by venv.sh on the host; this
# step covers the copy that runs inside the container.
# Shallow-clone the pinned Megatron-LM tag into a scratch directory, swap its
# megatron/core package in for the bundled one, and remove the scratch clone
# in the same layer so it never lands in the image.
RUN git clone --depth 1 --branch ${MEGATRON_CORE_VERSION} \
        https://github.com/NVIDIA/Megatron-LM.git /tmp/megatron-lm \
    && rm -rf /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core \
    && cp -r /tmp/megatron-lm/megatron/core /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/ \
    && rm -rf /tmp/megatron-lm

#################################################
## Install NVIDIA GDRCopy
Expand All @@ -72,6 +115,17 @@ ENV PATH /opt/gdrcopy/bin:$PATH

#################################################
## Install EFA installer
##
## The base nemo:26.02 image ships partial EFA components (efa-profile,
## libfabric1-aws, openmpi*-aws, libnccl-ofi-ngc-v2). Their dpkg state
## doesn't match the on-disk files, so the EFA installer's verify step
## refuses to upgrade with "ld.so.conf.d/000_efa.conf is installed by
## efa-profile package but doesn't exist". Purge them first.
# Best-effort cleanup: stderr is discarded and any non-zero exit status is
# ignored (`|| true`), so this step can never fail the build.
# NOTE(review): that also hides genuine purge failures — confirm this is intended.
RUN dpkg --purge --force-all \
efa-profile libfabric1-aws libfabric1-aws-dbg \
openmpi40-aws openmpi50-aws \
libnccl-ofi-ngc-v2 libnccl-ofi-ngc-v2-dbgsym 2>/dev/null || true

RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
Expand Down Expand Up @@ -132,5 +186,9 @@ ENV OMPI_MCA_pml=^cm,ucx \
## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
ENV PMIX_MCA_gds=hash

# Debug: Verify OFI NCCL and OPENMPI installation
RUN ls -la /opt/amazon/efa/lib/ && ls -la /opt/amazon/ofi-nccl/lib/ && ls -la /opt/amazon/openmpi/lib/
# Verify EFA / OFI NCCL / OpenMPI installation. NGC's libnccl-ofi-ngc-v2
# package installs to /opt/amazon/aws-ofi-nccl, while stock EFA installs
# to /opt/amazon/ofi-nccl — accept either.
# Fail the build unless the EFA libraries, one of the two OFI NCCL plugin
# locations, and the Open MPI libraries are all present.
RUN ls -la /opt/amazon/efa/lib/ \
    && { ls -la /opt/amazon/aws-ofi-nccl/lib/ || ls -la /opt/amazon/ofi-nccl/lib/; } \
    && ls -la /opt/amazon/openmpi/lib/
42 changes: 15 additions & 27 deletions 3.test_cases/megatron/nemo/kubernetes/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
FROM nvcr.io/nvidia/nemo:25.04.01
ARG GDRCOPY_VERSION=v2.4.1
ARG EFA_INSTALLER_VERSION=1.37.0
ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
ARG NCCL_VERSION=v2.23.4-1
ARG NCCL_TESTS_VERSION=v2.13.10
ARG TRANSFORMERS_VERSION=4.48.1
FROM nvcr.io/nvidia/nemo:26.02
ARG GDRCOPY_VERSION=v2.5.2
ARG EFA_INSTALLER_VERSION=1.48.0
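# AWS OFI NCCL is bundled into the EFA installation (/opt/amazon/ofi-nccl) for
# EFA installer >=1.47.0, so the plugin is no longer built from source in this
# Dockerfile. NOTE(review): AWS_OFI_NCCL_VERSION appears to be unused now that
# the source build is gone — confirm, then drop it or keep it for reference only.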
ARG AWS_OFI_NCCL_VERSION=v1.19.0
ARG NCCL_VERSION=v2.30.4-1
ARG NCCL_TESTS_VERSION=v2.18.3
ARG TRANSFORMERS_VERSION=4.57.6

ARG OPEN_MPI_PATH=/opt/amazon/openmpi

Expand Down Expand Up @@ -52,7 +54,7 @@ RUN rm -rf /root/.ssh/ \
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH
# The sibling nemo/Dockerfile (same nemo:26.02 base, same EFA installer) notes
# that on NGC images the OFI NCCL plugin lands in /opt/amazon/aws-ofi-nccl/
# rather than /opt/amazon/ofi-nccl/ used on stock Ubuntu. List both so the
# plugin resolves in either layout.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/aws-ofi-nccl/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH
# Prepend /opt/venv/bin so `python`/`pip` resolve to the venv that holds nemo
# and nemo_run. This line lists /usr/bin explicitly ahead of the inherited
# PATH, which would otherwise shadow the venv interpreter (the sibling
# nemo/Dockerfile documents the resulting `ModuleNotFoundError: nemo_run`).
# NOTE(review): assumes /opt/venv exists in nemo:26.02, as stated there.
ENV PATH=/opt/venv/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

######################
Expand Down Expand Up @@ -87,25 +89,11 @@ RUN cd $HOME \


###################################################
## Install AWS-OFI-NCCL plugin
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
#Switch from sh to bash to allow parameter expansion
SHELL ["/bin/bash", "-c"]
RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
&& make -j $(nproc) \
&& make install \
&& cd .. \
&& rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz

SHELL ["/bin/sh", "-c"]
## AWS OFI NCCL plugin: provided by the EFA installer (>=1.47.0), so the
## previous explicit source build is no longer needed. The plugin directory is
## /opt/amazon/aws-ofi-nccl on NGC images and /opt/amazon/ofi-nccl on stock
## Ubuntu (see the sibling nemo/Dockerfile); accept either and fail the build
## with a clear message only when neither is present.
RUN ls -la /opt/amazon/aws-ofi-nccl/lib/ \
    || ls -la /opt/amazon/ofi-nccl/lib/ \
    || (echo "AWS OFI NCCL not found in EFA installation; check EFA_INSTALLER_VERSION" && exit 1)

###################################################
RUN rm -rf /var/lib/apt/lists/*
Expand Down
18 changes: 9 additions & 9 deletions 3.test_cases/megatron/nemo/kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ Before you begin, ensure you have the following:

## 1. Building the AWS-Optimized NeMo Container for EFA Enabled Instances

**If you're not using an EFA enabled instance type, you can skip this step**. Here the base NeMo image (`nvcr.io/nvidia/nemo:25.04.01`) is enhanced with AWS-specific optimizations for EFA support.
**If you're not using an EFA enabled instance type, you can skip this step**. Here the base NeMo image (`nvcr.io/nvidia/nemo:26.02`) is enhanced with AWS-specific optimizations for EFA support.

### Build the Docker Image

Expand Down Expand Up @@ -359,7 +359,7 @@ cd data-processing/

## 5. Launching NeMo Training Jobs

> **Note**: The AWS-optimized container with EFA support can only be used for EFA enabled instances. For non-EFA usage, the default NeMo container (`nvcr.io/nvidia/nemo:25.04.01`) will work fine and you can omit the `--container_image` parameter.
> **Note**: The AWS-optimized container with EFA support can only be used for EFA enabled instances. For non-EFA usage, the default NeMo container (`nvcr.io/nvidia/nemo:26.02`) will work fine and you can omit the `--container_image` parameter.

### Overview

Expand Down Expand Up @@ -388,7 +388,7 @@ The repository provides multiple training scenarios to meet different needs:
| `--gpus` | GPU type (e.g., L40S, H100, A10G) | L40S |
| `--gpu-devices` | Number of GPUs per node | 4 |
| `--efa-devices` | Number of EFA devices per node | None |
| `--container_image` | Container image for training (required for using EFA) | nvcr.io/nvidia/nemo:25.04.01 |
| `--container_image` | Container image for training (required for using EFA) | nvcr.io/nvidia/nemo:26.02 |
| `--env_vars_file` | JSON file with environment variables | env_vars.json |
| `--pvc_name` | Name of the Persistent Volume Claim to use | fsx-claim |
| `--pvc_mount_path` | Path where the PVC should be mounted in the container | /mnt/nemo |
Expand Down Expand Up @@ -467,7 +467,7 @@ python pretrain_mock_dataset.py \
--nodes 1 \
--gpus L40S \
--gpu-devices 4 \
--container_image nvcr.io/nvidia/nemo:25.04.01 \
--container_image nvcr.io/nvidia/nemo:26.02 \
--env_vars_file env_vars.json \
--pvc_name fsx-claim \
--pvc_mount_path /mnt/nemo
Expand Down Expand Up @@ -496,7 +496,7 @@ python finetune_default_dataset.py \
--nodes 1 \
--gpus L40S \
--gpu-devices 4 \
--container_image nvcr.io/nvidia/nemo:25.04.01 \
--container_image nvcr.io/nvidia/nemo:26.02 \
--env_vars_file env_vars.json \
--pvc_name fsx-claim \
--pvc_mount_path /mnt/nemo
Expand All @@ -509,7 +509,7 @@ python finetune_default_dataset.py \
--nodes 1 \
--gpus L40S \
--gpu-devices 4 \
--container_image nvcr.io/nvidia/nemo:25.04.01 \
--container_image nvcr.io/nvidia/nemo:26.02 \
--env_vars_file env_vars.json \
--pvc_name fsx-claim \
--pvc_mount_path /mnt/nemo \
Expand All @@ -523,7 +523,7 @@ python finetune_default_dataset.py \
--nodes 1 \
--gpus L40S \
--gpu-devices 4 \
--container_image nvcr.io/nvidia/nemo:25.04.01 \
--container_image nvcr.io/nvidia/nemo:26.02 \
--env_vars_file env_vars.json \
--pvc_name fsx-claim \
--pvc_mount_path /mnt/nemo \
Expand Down Expand Up @@ -554,7 +554,7 @@ python finetune_custom_dataset.py \
--nodes 1 \
--gpus L40S \
--gpu-devices 4 \
--container_image nvcr.io/nvidia/nemo:25.04.01 \
--container_image nvcr.io/nvidia/nemo:26.02 \
--env_vars_file env_vars.json \
--pvc_name fsx-claim \
--pvc_mount_path /mnt/nemo \
Expand All @@ -570,7 +570,7 @@ python finetune_custom_dataset.py \
--nodes 2 \
--gpus L40S \
--gpu-devices 4 \
--container_image nvcr.io/nvidia/nemo:25.04.01 \
--container_image nvcr.io/nvidia/nemo:26.02 \
--env_vars_file env_vars.json \
--pvc_name fsx-claim \
--pvc_mount_path /mnt/nemo \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ spec:
restartPolicy: Never
containers:
- name: nemo-processing
image: nvcr.io/nvidia/nemo:25.04.01
image: nvcr.io/nvidia/nemo:26.02
command: ["/bin/bash"]
args: ["-c", "sleep infinity"]
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# function in the CustomDataModule class to match your dataset's structure
# =============================================================================

# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo


def get_parser():
Expand All @@ -32,7 +32,7 @@ def get_parser():
parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8)
parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None)
parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200)
parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12")
parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02")
parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json")
parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim")
parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo")
Expand Down Expand Up @@ -109,7 +109,7 @@ def skypilot_executor(
gpus: str = "L40S",
efa_devices: Optional[int] = None,
custom_mounts: Optional[dict[str, str]] = None,
container_image: str = "nvcr.io/nvidia/nemo:24.12",
container_image: str = "nvcr.io/nvidia/nemo:26.02",
env_vars_file: str = "env_vars.json",
pvc_name: str = "nemo-runs",
lora_enabled: bool = False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from nemo.utils import logging


# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo


def get_parser():
Expand All @@ -26,7 +26,7 @@ def get_parser():
parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8)
parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None)
parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200)
parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12")
parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02")
parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json")
parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim")
parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo")
Expand Down Expand Up @@ -76,7 +76,7 @@ def skypilot_executor(
gpus: str = "L40S",
efa_devices: Optional[int] = None,
custom_mounts: Optional[dict[str, str]] = None,
container_image: str = "nvcr.io/nvidia/nemo:24.12",
container_image: str = "nvcr.io/nvidia/nemo:26.02",
env_vars_file: str = "env_vars.json",
pvc_name: str = "nemo-runs",
lora_enabled: bool = False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from nemo.utils import logging
from datasets import load_dataset

# python pretrain.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
# python pretrain.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo


def small_llama_cfg() -> llm.GPTConfig:
Expand All @@ -36,7 +36,7 @@ def get_parser():
parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8)
parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None)
parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200)
parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12")
parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02")
parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json")
parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim")
parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo")
Expand All @@ -54,7 +54,7 @@ def skypilot_executor(
gpus: str = "L40S",
efa_devices: Optional[int] = None,
custom_mounts: Optional[dict[str, str]] = None,
container_image: str = "nvcr.io/nvidia/nemo:24.12",
container_image: str = "nvcr.io/nvidia/nemo:26.02",
env_vars_file: str = "env_vars.json",
pvc_name: str = "nemo-runs"
) -> run.SkypilotExecutor:
Expand Down
Loading
Loading