diff --git a/3.test_cases/megatron/nemo/Dockerfile b/3.test_cases/megatron/nemo/Dockerfile index e10cbfc57..21d8637cf 100644 --- a/3.test_cases/megatron/nemo/Dockerfile +++ b/3.test_cases/megatron/nemo/Dockerfile @@ -1,10 +1,19 @@ -FROM nvcr.io/nvidia/nemo:25.07.00 -ARG GDRCOPY_VERSION=v2.5 -ARG EFA_INSTALLER_VERSION=1.47.0 -# ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws # OFI NCCL already packaged into EFA installation (/opt/amazon/ofi-nccl) cf. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-changelog.html -ARG NCCL_VERSION=v2.27.7-1 -ARG NCCL_TESTS_VERSION=v2.16.9 -ARG TRANSFORMERS_VERSION=4.56.1 +FROM nvcr.io/nvidia/nemo:26.02 +ARG GDRCOPY_VERSION=v2.5.2 +ARG EFA_INSTALLER_VERSION=1.48.0 +# AWS OFI NCCL is bundled into the EFA installation (/opt/amazon/ofi-nccl, or +# /opt/amazon/aws-ofi-nccl on NGC base images) for EFA installer >=1.47.0. Cf. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-changelog.html +ARG AWS_OFI_NCCL_VERSION=v1.19.0 +ARG NCCL_VERSION=v2.30.4-1 +ARG NCCL_TESTS_VERSION=v2.18.3 +ARG TRANSFORMERS_VERSION=4.57.6 +# Pin megatron-core to the version that NeMo 2.7.x is API-compatible with. +# nemo:26.02 ships megatron-core 0.16.1 at /opt/Megatron-Bridge/3rdparty/Megatron-LM/ +# but bundled NeMo 2.7.1 calls APIs 0.16.x removed (get_megatron_optimizer +# kwargs no_weight_decay_cond/scale_lr_cond/lr_mult) and imports submodules +# 0.16.x dropped (megatron.core.dist_checkpointing.strategies.tensorstore). +# 0.15.3 was the latest 0.15.x release on PyPI / GitHub when this pin was added. +ARG MEGATRON_CORE_VERSION=core_v0.15.3 ARG OPEN_MPI_PATH=/opt/amazon/openmpi # Open MPI already packaged into EFA installation (/opt/amazon/openmpi) @@ -38,8 +47,13 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \ libtool \ openssh-client \ openssh-server \ - vim \ - && apt autoremove -y + vim +# NOTE: deliberately no `apt autoremove` here. The nemo:26.02 base image +# ships several CUDA add-on libraries (libcusparseLt0, libcudnn-frontend, +# etc.) 
that apt sees as orphaned because no installed Debian package +# declares a Depends: on them — they're loaded via dlopen by torch / transformer_engine +# at runtime. autoremove deletes them and `import torch` then crashes with +# `libcusparseLt.so.0: cannot open shared object file`. RUN mkdir -p /var/run/sshd && \ sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ @@ -53,8 +67,37 @@ RUN rm -rf /root/.ssh/ \ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH -ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH +# NGC images install the OFI NCCL plugin via the libnccl-ofi-ngc-v2 package +# from the EFA installer, which lands at /opt/amazon/aws-ofi-nccl/ rather than +# /opt/amazon/ofi-nccl/ used on stock Ubuntu. Cover both for portability. +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/aws-ofi-nccl/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH +# Prepend /opt/venv/bin so every `python`/`pip` resolves to the uv-managed +# venv where nemo, megatron, nemo_run, etc. live. The nemo:26.02 image's +# default PATH puts /usr/bin BEFORE /opt/venv/bin, which breaks torchelastic +# worker spawn: `ft_launcher` finds /usr/bin/python (no nemo_run) instead of +# /opt/venv/bin/python and crashes with `ModuleNotFoundError: nemo_run`. +ENV PATH=/opt/venv/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +# nemo:26.02 ships libcusparseLt0 (libcusparselt0-cuda-13) into the nested +# /usr/lib/x86_64-linux-gnu/libcusparseLt/13/ directory but does NOT create +# the libcusparseLt.so.0 SONAME symlink and does NOT add an ld.so.conf.d +# entry. 
torch dlopens libcusparseLt.so.0 at import time and crashes: +# ImportError: libcusparseLt.so.0: cannot open shared object file +# Register the dir with ld.so AND let ldconfig create the SONAME symlink. +RUN echo /usr/lib/x86_64-linux-gnu/libcusparseLt/13 > /etc/ld.so.conf.d/000_libcusparselt.conf \ + && ldconfig + +# Pin the bundled megatron-core to MEGATRON_CORE_VERSION (see ARG above for +# rationale). Replace the in-place 0.16.1 source tree at +# /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ with the 0.15.3 +# release. The host-side venv gets the same pin from slurm/venv.sh; this +# step covers the source tree that is used inside the container. +RUN git clone -b ${MEGATRON_CORE_VERSION} --depth 1 \ + https://github.com/NVIDIA/Megatron-LM.git /tmp/megatron-lm \ + && rm -rf /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core \ + && cp -r /tmp/megatron-lm/megatron/core \ + /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/ \ + && rm -rf /tmp/megatron-lm ################################################# ## Install NVIDIA GDRCopy @@ -72,6 +115,17 @@ ENV PATH /opt/gdrcopy/bin:$PATH ################################################# ## Install EFA installer +## +## The base nemo:26.02 image ships partial EFA components (efa-profile, +## libfabric1-aws, openmpi*-aws, libnccl-ofi-ngc-v2). Their dpkg state +## doesn't match the on-disk files, so the EFA installer's verify step +## refuses to upgrade with "ld.so.conf.d/000_efa.conf is installed by +## efa-profile package but doesn't exist". Purge them first. 
+RUN dpkg --purge --force-all \ + efa-profile libfabric1-aws libfabric1-aws-dbg \ + openmpi40-aws openmpi50-aws \ + libnccl-ofi-ngc-v2 libnccl-ofi-ngc-v2-dbgsym 2>/dev/null || true + RUN cd $HOME \ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ @@ -132,5 +186,9 @@ ENV OMPI_MCA_pml=^cm,ucx \ ## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 ENV PMIX_MCA_gds=hash -# Debug: Verify OFI NCCL and OPENMPI installation -RUN ls -la /opt/amazon/efa/lib/ && ls -la /opt/amazon/ofi-nccl/lib/ && ls -la /opt/amazon/openmpi/lib/ +# Verify EFA / OFI NCCL / OpenMPI installation. NGC's libnccl-ofi-ngc-v2 +# package installs to /opt/amazon/aws-ofi-nccl, while stock EFA installs +# to /opt/amazon/ofi-nccl — accept either. +RUN ls -la /opt/amazon/efa/lib/ \ + && (ls -la /opt/amazon/aws-ofi-nccl/lib/ || ls -la /opt/amazon/ofi-nccl/lib/) \ + && ls -la /opt/amazon/openmpi/lib/ diff --git a/3.test_cases/megatron/nemo/kubernetes/Dockerfile b/3.test_cases/megatron/nemo/kubernetes/Dockerfile index bcb2f9c2b..43d8c6774 100644 --- a/3.test_cases/megatron/nemo/kubernetes/Dockerfile +++ b/3.test_cases/megatron/nemo/kubernetes/Dockerfile @@ -1,10 +1,12 @@ -FROM nvcr.io/nvidia/nemo:25.04.01 -ARG GDRCOPY_VERSION=v2.4.1 -ARG EFA_INSTALLER_VERSION=1.37.0 -ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws -ARG NCCL_VERSION=v2.23.4-1 -ARG NCCL_TESTS_VERSION=v2.13.10 -ARG TRANSFORMERS_VERSION=4.48.1 +FROM nvcr.io/nvidia/nemo:26.02 +ARG GDRCOPY_VERSION=v2.5.2 +ARG EFA_INSTALLER_VERSION=1.48.0 +# AWS OFI NCCL is bundled into the EFA installation (/opt/amazon/ofi-nccl, or +# /opt/amazon/aws-ofi-nccl on NGC images) for EFA installer >=1.47.0, so it is no longer built from source here. NOTE(review): AWS_OFI_NCCL_VERSION looks unused now — confirm. 
+ARG AWS_OFI_NCCL_VERSION=v1.19.0 +ARG NCCL_VERSION=v2.30.4-1 +ARG NCCL_TESTS_VERSION=v2.18.3 +ARG TRANSFORMERS_VERSION=4.57.6 ARG OPEN_MPI_PATH=/opt/amazon/openmpi @@ -52,7 +54,7 @@ RUN rm -rf /root/.ssh/ \ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config -ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/aws-ofi-nccl/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH ###################### @@ -87,25 +89,11 @@ RUN cd $HOME \ ################################################### -## Install AWS-OFI-NCCL plugin -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev -#Switch from sh to bash to allow parameter expansion -SHELL ["/bin/bash", "-c"] -RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-mpi=/opt/amazon/openmpi \ - --with-libfabric=/opt/amazon/efa \ - --with-cuda=/usr/local/cuda \ - --enable-platform-aws \ - && make -j $(nproc) \ - && make install \ - && cd .. \ - && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz - -SHELL ["/bin/sh", "-c"] +## AWS OFI NCCL plugin: installed by the EFA installer (>=1.47.0), so the +## previous source build is gone. NGC base images get it in /opt/amazon/aws-ofi-nccl, +## stock installs in /opt/amazon/ofi-nccl; verify that one of the two exists. 
+RUN ls -la /opt/amazon/aws-ofi-nccl/lib/ || ls -la /opt/amazon/ofi-nccl/lib/ || \ + (echo "AWS OFI NCCL not found in EFA installation; check EFA_INSTALLER_VERSION" && exit 1) ################################################### RUN rm -rf /var/lib/apt/lists/* diff --git a/3.test_cases/megatron/nemo/kubernetes/README.md b/3.test_cases/megatron/nemo/kubernetes/README.md index 156c298d6..01fa661dc 100644 --- a/3.test_cases/megatron/nemo/kubernetes/README.md +++ b/3.test_cases/megatron/nemo/kubernetes/README.md @@ -124,7 +124,7 @@ Before you begin, ensure you have the following: ## 1. Building the AWS-Optimized NeMo Container for EFA Enabled Instances -**If you're not using an EFA enabled instance type, you can skip this step**. Here the base NeMo image (`nvcr.io/nvidia/nemo:25.04.01`) is enhanced with AWS-specific optimizations for EFA support. +**If you're not using an EFA enabled instance type, you can skip this step**. Here the base NeMo image (`nvcr.io/nvidia/nemo:26.02`) is enhanced with AWS-specific optimizations for EFA support. ### Build the Docker Image @@ -359,7 +359,7 @@ cd data-processing/ ## 5. Launching NeMo Training Jobs -> **Note**: The AWS-optimized container with EFA support can only be used for EFA enabled instances. For non-EFA usage, the default NeMo container (`nvcr.io/nvidia/nemo:25.04.01`) will work fine and you can omit the `--container_image` parameter. +> **Note**: The AWS-optimized container with EFA support can only be used for EFA enabled instances. For non-EFA usage, the default NeMo container (`nvcr.io/nvidia/nemo:26.02`) will work fine and you can omit the `--container_image` parameter. 
### Overview @@ -388,7 +388,7 @@ The repository provides multiple training scenarios to meet different needs: | `--gpus` | GPU type (e.g., L40S, H100, A10G) | L40S | | `--gpu-devices` | Number of GPUs per node | 4 | | `--efa-devices` | Number of EFA devices per node | None | -| `--container_image` | Container image for training (required for using EFA) | nvcr.io/nvidia/nemo:25.04.01 | +| `--container_image` | Container image for training (required for using EFA) | nvcr.io/nvidia/nemo:26.02 | | `--env_vars_file` | JSON file with environment variables | env_vars.json | | `--pvc_name` | Name of the Persistent Volume Claim to use | fsx-claim | | `--pvc_mount_path` | Path where the PVC should be mounted in the container | /mnt/nemo | @@ -467,7 +467,7 @@ python pretrain_mock_dataset.py \ --nodes 1 \ --gpus L40S \ --gpu-devices 4 \ - --container_image nvcr.io/nvidia/nemo:25.04.01 \ + --container_image nvcr.io/nvidia/nemo:26.02 \ --env_vars_file env_vars.json \ --pvc_name fsx-claim \ --pvc_mount_path /mnt/nemo @@ -496,7 +496,7 @@ python finetune_default_dataset.py \ --nodes 1 \ --gpus L40S \ --gpu-devices 4 \ - --container_image nvcr.io/nvidia/nemo:25.04.01 \ + --container_image nvcr.io/nvidia/nemo:26.02 \ --env_vars_file env_vars.json \ --pvc_name fsx-claim \ --pvc_mount_path /mnt/nemo @@ -509,7 +509,7 @@ python finetune_default_dataset.py \ --nodes 1 \ --gpus L40S \ --gpu-devices 4 \ - --container_image nvcr.io/nvidia/nemo:25.04.01 \ + --container_image nvcr.io/nvidia/nemo:26.02 \ --env_vars_file env_vars.json \ --pvc_name fsx-claim \ --pvc_mount_path /mnt/nemo \ @@ -523,7 +523,7 @@ python finetune_default_dataset.py \ --nodes 1 \ --gpus L40S \ --gpu-devices 4 \ - --container_image nvcr.io/nvidia/nemo:25.04.01 \ + --container_image nvcr.io/nvidia/nemo:26.02 \ --env_vars_file env_vars.json \ --pvc_name fsx-claim \ --pvc_mount_path /mnt/nemo \ @@ -554,7 +554,7 @@ python finetune_custom_dataset.py \ --nodes 1 \ --gpus L40S \ --gpu-devices 4 \ - --container_image 
nvcr.io/nvidia/nemo:25.04.01 \ + --container_image nvcr.io/nvidia/nemo:26.02 \ --env_vars_file env_vars.json \ --pvc_name fsx-claim \ --pvc_mount_path /mnt/nemo \ @@ -570,7 +570,7 @@ python finetune_custom_dataset.py \ --nodes 2 \ --gpus L40S \ --gpu-devices 4 \ - --container_image nvcr.io/nvidia/nemo:25.04.01 \ + --container_image nvcr.io/nvidia/nemo:26.02 \ --env_vars_file env_vars.json \ --pvc_name fsx-claim \ --pvc_mount_path /mnt/nemo \ diff --git a/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml b/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml index 52e03bb14..79cba7b41 100644 --- a/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml +++ b/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml @@ -8,7 +8,7 @@ spec: restartPolicy: Never containers: - name: nemo-processing - image: nvcr.io/nvidia/nemo:25.04.01 + image: nvcr.io/nvidia/nemo:26.02 command: ["/bin/bash"] args: ["-c", "sleep infinity"] resources: diff --git a/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py b/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py index 210c7bb57..fcf0052d9 100644 --- a/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py +++ b/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py @@ -22,7 +22,7 @@ # function in the CustomDataModule class to match your dataset's structure # ============================================================================= -# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo +# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo def get_parser(): @@ -32,7 +32,7 
@@ def get_parser(): parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8) parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None) parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200) - parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12") + parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02") parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json") parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim") parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo") @@ -109,7 +109,7 @@ def skypilot_executor( gpus: str = "L40S", efa_devices: Optional[int] = None, custom_mounts: Optional[dict[str, str]] = None, - container_image: str = "nvcr.io/nvidia/nemo:24.12", + container_image: str = "nvcr.io/nvidia/nemo:26.02", env_vars_file: str = "env_vars.json", pvc_name: str = "nemo-runs", lora_enabled: bool = False, diff --git a/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py b/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py index 2a6e441da..d56c8f93f 100644 --- a/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py +++ b/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py @@ -16,7 +16,7 @@ from nemo.utils import logging -# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo +# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json 
--pvc_name fsx-claim --pvc_mount_path /mnt/nemo def get_parser(): @@ -26,7 +26,7 @@ def get_parser(): parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8) parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None) parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200) - parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12") + parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02") parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json") parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim") parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo") @@ -76,7 +76,7 @@ def skypilot_executor( gpus: str = "L40S", efa_devices: Optional[int] = None, custom_mounts: Optional[dict[str, str]] = None, - container_image: str = "nvcr.io/nvidia/nemo:24.12", + container_image: str = "nvcr.io/nvidia/nemo:26.02", env_vars_file: str = "env_vars.json", pvc_name: str = "nemo-runs", lora_enabled: bool = False, diff --git a/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py b/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py index 04f07bc7e..8a8d182fd 100644 --- a/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py +++ b/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py @@ -12,7 +12,7 @@ from nemo.utils import logging from datasets import load_dataset -# python pretrain.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo +# python pretrain.py --max_steps 200 --nodes 
1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo def small_llama_cfg() -> llm.GPTConfig: @@ -36,7 +36,7 @@ def get_parser(): parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8) parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None) parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200) - parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12") + parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02") parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json") parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim") parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo") @@ -54,7 +54,7 @@ def skypilot_executor( gpus: str = "L40S", efa_devices: Optional[int] = None, custom_mounts: Optional[dict[str, str]] = None, - container_image: str = "nvcr.io/nvidia/nemo:24.12", + container_image: str = "nvcr.io/nvidia/nemo:26.02", env_vars_file: str = "env_vars.json", pvc_name: str = "nemo-runs" ) -> run.SkypilotExecutor: diff --git a/3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py b/3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py index 9a807a59e..c36345981 100644 --- a/3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py +++ b/3.test_cases/megatron/nemo/kubernetes/pretrain_mock_dataset.py @@ -15,7 +15,7 @@ from nemo.lightning.run import plugins -# python pretrain.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json 
--pvc_name fsx-claim --pvc_mount_path /mnt/nemo +# python pretrain.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo def small_llama_cfg() -> llm.GPTConfig: @@ -39,7 +39,7 @@ def get_parser(): parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8) parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None) parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200) - parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12") + parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02") parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json") parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim") parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo") @@ -54,7 +54,7 @@ def skypilot_executor( gpus: str = "L40S", efa_devices: Optional[int] = None, custom_mounts: Optional[dict[str, str]] = None, - container_image: str = "nvcr.io/nvidia/nemo:24.12", + container_image: str = "nvcr.io/nvidia/nemo:26.02", env_vars_file: str = "env_vars.json", pvc_name: str = "nemo-runs", ) -> run.SkypilotExecutor: diff --git a/3.test_cases/megatron/nemo/slurm/README.md b/3.test_cases/megatron/nemo/slurm/README.md index 1dc51acbf..72abe4205 100644 --- a/3.test_cases/megatron/nemo/slurm/README.md +++ b/3.test_cases/megatron/nemo/slurm/README.md @@ -30,8 +30,8 @@ Before running NeMo jobs, build a custom optimized container image for EFA and C Build Image: ```bash - docker build --progress=plain -t aws-nemo:25.07 -f ../Dockerfile .. 
- enroot import -o ~/aws-nemo-25-07.sqsh dockerd://aws-nemo:25.07 + docker build --progress=plain -t aws-nemo:26.02 -f ../Dockerfile .. + enroot import -o ~/aws-nemo-26-02.sqsh dockerd://aws-nemo:26.02 ``` ## 5. Install Dependencies and Prepare NeMo 2.0 Environment @@ -73,7 +73,7 @@ In NeMo-Run, you can build and configure everything using Python, eliminating th In this example, we run the following script to start the LLaMa 8B pretraining job: ```bash - python run.py --container_image ~/aws-nemo-25-07.sqsh --nodes 2 --partition dev --env_vars_file env_vars.json --max_steps 1000 + python run.py --container_image ~/aws-nemo-26-02.sqsh --nodes 2 --partition dev --env_vars_file env_vars.json --max_steps 1000 ``` ## 7. References diff --git a/3.test_cases/megatron/nemo/slurm/env_vars.json b/3.test_cases/megatron/nemo/slurm/env_vars.json index 3557d0a1e..27c3ce66a 100644 --- a/3.test_cases/megatron/nemo/slurm/env_vars.json +++ b/3.test_cases/megatron/nemo/slurm/env_vars.json @@ -2,8 +2,8 @@ "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", "FI_EFA_USE_HUGE_PAGE": "0", "NCCL_DEBUG": "INFO", - "FI_PROVIDER": "efa" + "FI_PROVIDER": "efa", + "PATH": "/opt/venv/bin:/opt/slurm/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" } diff --git a/3.test_cases/megatron/nemo/slurm/run.py b/3.test_cases/megatron/nemo/slurm/run.py index 754c8f51b..e6160ef9f 100644 --- a/3.test_cases/megatron/nemo/slurm/run.py +++ b/3.test_cases/megatron/nemo/slurm/run.py @@ -8,7 +8,7 @@ from typing import Any, Optional from nemo.collections import llm from nemo.lightning.run import plugins -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.collections.common.tokenizers.tokenizer_utils import get_nmt_tokenizer from nemo.collections.llm.recipes.callbacks.common import straggler_det_callback from 
nemo.lightning.pytorch.callbacks import PreemptionCallback from nemo.lightning.run import plugins @@ -37,7 +37,7 @@ def get_parser(): parser.add_argument("--nodes", type=int, help="Number of nodes to run on", default=1) parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200) parser.add_argument("--account", type=str, help="Slurm account to use", default="ubuntu") - parser.add_argument("--container_image", type=str, help="Container image to use", default="/fsx/ubuntu/aws-nemo-24-12.sqsh") + parser.add_argument("--container_image", type=str, help="Container image to use", default="/fsx/ubuntu/aws-nemo-26-02.sqsh") parser.add_argument("--time", type=str, help="Time to run the job", default="01:00:00") parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json") parser.add_argument("--ntasks_per_node", type=int, help="Number of tasks per node", default=8) @@ -54,7 +54,7 @@ def slurm_executor( remote_job_dir: str = "/fsx/ubuntu/aws-nemo", time: str = "01:00:00", custom_mounts: Optional[list[str]] = None, - container_image: str = "/fsx/ubuntu/aws-nemo-24-12.sqsh", + container_image: str = "/fsx/ubuntu/aws-nemo-26-02.sqsh", env_vars_file: str = "env_vars.json", ntasks_per_node: int = 8, retries: int = 0, diff --git a/3.test_cases/megatron/nemo/slurm/venv.sh b/3.test_cases/megatron/nemo/slurm/venv.sh index 9e0972a7e..78ea5e5c6 100644 --- a/3.test_cases/megatron/nemo/slurm/venv.sh +++ b/3.test_cases/megatron/nemo/slurm/venv.sh @@ -3,29 +3,32 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 -set -e +# Host-side venv for NeMo-Run. The actual training runs inside the +# `nvcr.io/nvidia/nemo:26.02` container (built by ../Dockerfile); this venv +# only needs to satisfy NeMo-Run's import-time deps on the head node. 
-# Install NeMo-Run -pip install git+https://github.com/NVIDIA/NeMo-Run.git@4d056535b5cce475b0536243e2cefcfa3897eee8 +set -e -# # Install PyTorch -pip install torch==2.6.0 - -# Install Megatron-LM -pip install --no-deps git+https://github.com/NVIDIA/Megatron-LM.git@b5d90de8e7c7fae5f35be89d665f237970540bed +# Pin NeMo-Run to the 0.9.0 release (previously an untagged commit SHA) so the +# pin is readable and `bash venv.sh` produces the same environment across runs. +pip install "nemo-run==0.9.0" -# # Download and install Mamba SSM -wget https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl # Adjusted for torch 2.0 -pip install mamba_ssm-2.2.2+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl -rm mamba_ssm-2.2.2+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl +# Torch is a NeMo-Run import-time dep on the host; CUDA flavor doesn't matter +# here because all GPU work happens inside the container. +pip install "torch==2.10.0" -# Install NeMo Toolkit -pip install nemo_toolkit['all']==2.1.0 +# Megatron-LM pinned to the version that NeMo 2.7.x is API-compatible with +# (must match MEGATRON_CORE_VERSION in ../Dockerfile). nemo:26.02 ships +# 0.16.1 alongside NeMo 2.7.1, but 0.16.x removed kwargs and +# submodules NeMo 2.7.x still references (see Dockerfile comment for +# details). 0.15.3 was the latest 0.15.x patch when this pin was added. +pip install --no-deps "git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.3" -# Install OpenCC -pip install opencc==1.1.6 +# NeMo Toolkit. PERFORMANCE.md (in this directory's parent) lists 2.5+ as +# recommended on the NeMo 26.02 container. 2.7.3 is the latest patch in 2.x. +pip install "nemo_toolkit[all]==2.7.3" -# Clone and install NVIDIA Resiliency Extension -pip install nvidia-resiliency-ext="v0.2.1" +# NVIDIA Resiliency Extension for fault-tolerance plugins used in run.py. +pip install "nvidia-resiliency-ext==0.4.1" echo "Environment setup complete."