awslabs · KeitaW · May 15, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/3.test_cases/megatron/nemo/Dockerfile b/3.test_cases/megatron/nemo/Dockerfile
@@ -1,10 +1,19 @@
-FROM nvcr.io/nvidia/nemo:25.07.00
-ARG GDRCOPY_VERSION=v2.5
-ARG EFA_INSTALLER_VERSION=1.47.0
-# ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws    # OFI NCCL already packaged into EFA installation (/opt/amazon/ofi-nccl) cf. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-changelog.html
-ARG NCCL_VERSION=v2.27.7-1
-ARG NCCL_TESTS_VERSION=v2.16.9
-ARG TRANSFORMERS_VERSION=4.56.1
+FROM nvcr.io/nvidia/nemo:26.02
+ARG GDRCOPY_VERSION=v2.5.2
+ARG EFA_INSTALLER_VERSION=1.48.0
+# AWS OFI NCCL is bundled into the EFA installation (/opt/amazon/ofi-nccl) for
+# EFA installer >=1.47.0. Cf. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-changelog.html
+ARG AWS_OFI_NCCL_VERSION=v1.19.0
+ARG NCCL_VERSION=v2.30.4-1
+ARG NCCL_TESTS_VERSION=v2.18.3
+ARG TRANSFORMERS_VERSION=4.57.6
+# Pin megatron-core to the version that NeMo 2.7.x is API-compatible with.
+# nemo:26.02 ships megatron-core 0.16.1 at /opt/Megatron-Bridge/3rdparty/Megatron-LM/
+# but bundled NeMo 2.7.1 calls APIs 0.16.x removed (get_megatron_optimizer
+# kwargs no_weight_decay_cond/scale_lr_cond/lr_mult) and imports submodules
+# 0.16.x dropped (megatron.core.dist_checkpointing.strategies.tensorstore).
+# 0.15.3 is the latest 0.15.x release on PyPI / GitHub.
+ARG MEGATRON_CORE_VERSION=core_v0.15.3
 
 
 ARG OPEN_MPI_PATH=/opt/amazon/openmpi    # Open MPI already packaged into EFA installation (/opt/amazon/openmpi)
@@ -38,8 +47,13 @@ RUN DEBIAN_FRONTEND=noninteractive apt install -y --allow-unauthenticated \
     libtool \
     openssh-client \
     openssh-server \
-    vim \
-    && apt autoremove -y
+    vim
+# NOTE: deliberately no `apt autoremove` here. The nemo:26.02 base image
+# ships several CUDA add-on libraries (libcusparseLt0, libcudnn-frontend,
+# etc.) that apt sees as orphaned because no installed Debian package
+# Depends: them — they're loaded via dlopen by torch / transformer_engine
+# at runtime. autoremove deletes them and `import torch` then crashes with
+# `libcusparseLt.so.0: cannot open shared object file`.
 
 RUN mkdir -p /var/run/sshd && \
     sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
@@ -53,8 +67,37 @@ RUN rm -rf /root/.ssh/ \
  && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
  && printf "Host *\n  StrictHostKeyChecking no\n" >> /root/.ssh/config
 
-ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH
-ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
+# NGC images install the OFI NCCL plugin via the libnccl-ofi-ngc-v2 package
+# from the EFA installer, which lands at /opt/amazon/aws-ofi-nccl/ rather than
+# /opt/amazon/ofi-nccl/ used on stock Ubuntu. Cover both for portability.
+ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/aws-ofi-nccl/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH
+# Prepend /opt/venv/bin so every `python`/`pip` resolves to the uv-managed
+# venv where nemo, megatron, nemo_run, etc. live. The nemo:26.02 image's
+# default PATH puts /usr/bin BEFORE /opt/venv/bin, which breaks torchelastic
+# worker spawn: `ft_launcher` finds /usr/bin/python (no nemo_run) instead of
+# /opt/venv/bin/python and crashes with `ModuleNotFoundError: nemo_run`.
+ENV PATH=/opt/venv/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
+
+# nemo:26.02 installs libcusparseLt0 (libcusparselt0-cuda-13) under the nested
+# /usr/lib/x86_64-linux-gnu/libcusparseLt/13/ directory, but creates neither
+# the libcusparseLt.so.0 SONAME symlink nor an ld.so.conf.d entry, so torch's
+# dlopen of libcusparseLt.so.0 at import time fails with:
+#   ImportError: libcusparseLt.so.0: cannot open shared object file
+# Add the directory to ld.so's search path and run ldconfig to create the link.
+RUN printf '%s\n' /usr/lib/x86_64-linux-gnu/libcusparseLt/13 \
+      > /etc/ld.so.conf.d/000_libcusparselt.conf && ldconfig
+
+# Swap the bundled megatron-core sources for the MEGATRON_CORE_VERSION tag
+# (rationale is next to that ARG): the 0.16.1 tree shipped at
+# /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ is replaced by a
+# shallow clone of the pinned release. venv.sh updates pip's site-packages
+# copy on the host; this step handles the copy used inside the container.
+RUN MCORE_PARENT=/opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron \
+ && git clone --depth 1 --branch ${MEGATRON_CORE_VERSION} \
+        https://github.com/NVIDIA/Megatron-LM.git /tmp/megatron-lm \
+ && rm -rf ${MCORE_PARENT}/core \
+ && cp -r /tmp/megatron-lm/megatron/core ${MCORE_PARENT}/ \
+ && rm -rf /tmp/megatron-lm
 
 #################################################
 ## Install NVIDIA GDRCopy
@@ -72,6 +115,17 @@ ENV PATH /opt/gdrcopy/bin:$PATH
 
 #################################################
 ## Install EFA installer
+##
+## The base nemo:26.02 image ships partial EFA components (efa-profile,
+## libfabric1-aws, openmpi*-aws, libnccl-ofi-ngc-v2) whose dpkg state
+## doesn't match the on-disk files, so the EFA installer's verify step
+## refuses to upgrade with "ld.so.conf.d/000_efa.conf is installed by
+## efa-profile package but doesn't exist". Purge them first (best-effort:
+## stderr is discarded and `|| true` ignores a failing dpkg exit status).
+RUN dpkg --purge --force-all \
+    efa-profile libfabric1-aws libfabric1-aws-dbg \
+    openmpi40-aws openmpi50-aws \
+    libnccl-ofi-ngc-v2 libnccl-ofi-ngc-v2-dbgsym 2>/dev/null || true
+
 RUN cd $HOME \
     && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
     && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
@@ -132,5 +186,9 @@ ENV OMPI_MCA_pml=^cm,ucx            \
 ## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
 ENV PMIX_MCA_gds=hash
 
-# Debug: Verify OFI NCCL and OPENMPI installation
-RUN ls -la /opt/amazon/efa/lib/ && ls -la /opt/amazon/ofi-nccl/lib/ && ls -la /opt/amazon/openmpi/lib/
+# Sanity-check the EFA, OFI NCCL and OpenMPI library directories. The OFI NCCL
+# plugin lives in /opt/amazon/aws-ofi-nccl when it comes from NGC's
+# libnccl-ofi-ngc-v2 package and in /opt/amazon/ofi-nccl for stock EFA.
+RUN ls -la /opt/amazon/efa/lib/ \
+ && { ls -la /opt/amazon/aws-ofi-nccl/lib/ || ls -la /opt/amazon/ofi-nccl/lib/; } \
+ && ls -la /opt/amazon/openmpi/lib/
diff --git a/3.test_cases/megatron/nemo/kubernetes/Dockerfile b/3.test_cases/megatron/nemo/kubernetes/Dockerfile
@@ -1,10 +1,12 @@
-FROM nvcr.io/nvidia/nemo:25.04.01
-ARG GDRCOPY_VERSION=v2.4.1
-ARG EFA_INSTALLER_VERSION=1.37.0
-ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
-ARG NCCL_VERSION=v2.23.4-1
-ARG NCCL_TESTS_VERSION=v2.13.10
-ARG TRANSFORMERS_VERSION=4.48.1
+FROM nvcr.io/nvidia/nemo:26.02
+ARG GDRCOPY_VERSION=v2.5.2
+ARG EFA_INSTALLER_VERSION=1.48.0
+# AWS OFI NCCL is bundled into the EFA installation (/opt/amazon/ofi-nccl) for
+# EFA installer >=1.47.0, so this Dockerfile no longer builds it from source.
+ARG AWS_OFI_NCCL_VERSION=v1.19.0
+ARG NCCL_VERSION=v2.30.4-1
+ARG NCCL_TESTS_VERSION=v2.18.3
+ARG TRANSFORMERS_VERSION=4.57.6
 
 ARG OPEN_MPI_PATH=/opt/amazon/openmpi
 
@@ -52,7 +54,7 @@ RUN rm -rf /root/.ssh/ \
  && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
  && printf "Host *\n  StrictHostKeyChecking no\n" >> /root/.ssh/config
 
-ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/aws-ofi-nccl/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH
 ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
 
 ######################
@@ -87,25 +89,11 @@ RUN cd $HOME \
 
 
 ###################################################
-## Install AWS-OFI-NCCL plugin
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
-#Switch from sh to bash to allow parameter expansion
-SHELL ["/bin/bash", "-c"]
-RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
-    && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
-    && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
-    && ./configure --prefix=/opt/aws-ofi-nccl/install \
-        --with-mpi=/opt/amazon/openmpi \
-        --with-libfabric=/opt/amazon/efa \
-        --with-cuda=/usr/local/cuda \
-        --enable-platform-aws \
-    && make -j $(nproc) \
-    && make install \
-    && cd .. \
-    && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
-    && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz
-
-SHELL ["/bin/sh", "-c"]
+## AWS OFI NCCL plugin: bundled by the EFA installer (>=1.47.0), so it is no
+## longer built from source. NGC's libnccl-ofi-ngc-v2 package installs to
+## /opt/amazon/aws-ofi-nccl, stock EFA to /opt/amazon/ofi-nccl; accept either.
+RUN ls -la /opt/amazon/aws-ofi-nccl/lib/ || ls -la /opt/amazon/ofi-nccl/lib/ || \
+    (echo "AWS OFI NCCL not found in EFA installation; check EFA_INSTALLER_VERSION" && exit 1)
 
 ###################################################
 RUN rm -rf /var/lib/apt/lists/*

diff --git a/3.test_cases/megatron/nemo/kubernetes/README.md b/3.test_cases/megatron/nemo/kubernetes/README.md
@@ -124,7 +124,7 @@ Before you begin, ensure you have the following:
 
 ## 1. Building the AWS-Optimized NeMo Container for EFA Enabled Instances
 
-**If you're not using an EFA enabled instance type, you can skip this step**. Here the base NeMo image (`nvcr.io/nvidia/nemo:25.04.01`) is enhanced with AWS-specific optimizations for EFA support.
+**If you're not using an EFA enabled instance type, you can skip this step**. Here the base NeMo image (`nvcr.io/nvidia/nemo:26.02`) is enhanced with AWS-specific optimizations for EFA support.
 
 ### Build the Docker Image
 
@@ -359,7 +359,7 @@ cd data-processing/
 
 ## 5. Launching NeMo Training Jobs
 
-> **Note**: The AWS-optimized container with EFA support can only be used for EFA enabled instances. For non-EFA usage, the default NeMo container (`nvcr.io/nvidia/nemo:25.04.01`) will work fine and you can omit the `--container_image` parameter.
+> **Note**: The AWS-optimized container with EFA support can only be used for EFA enabled instances. For non-EFA usage, the default NeMo container (`nvcr.io/nvidia/nemo:26.02`) will work fine and you can omit the `--container_image` parameter.
 
 ### Overview
 
@@ -388,7 +388,7 @@ The repository provides multiple training scenarios to meet different needs:
 | `--gpus` | GPU type (e.g., L40S, H100, A10G) | L40S |
 | `--gpu-devices` | Number of GPUs per node | 4 |
 | `--efa-devices` | Number of EFA devices per node | None |
-| `--container_image` | Container image for training (required for using EFA) | nvcr.io/nvidia/nemo:25.04.01 |
+| `--container_image` | Container image for training (required for using EFA) | nvcr.io/nvidia/nemo:26.02 |
 | `--env_vars_file` | JSON file with environment variables | env_vars.json |
 | `--pvc_name` | Name of the Persistent Volume Claim to use | fsx-claim |
 | `--pvc_mount_path` | Path where the PVC should be mounted in the container | /mnt/nemo |
@@ -467,7 +467,7 @@ python pretrain_mock_dataset.py \
     --nodes 1 \
     --gpus L40S \
     --gpu-devices 4 \
-    --container_image nvcr.io/nvidia/nemo:25.04.01 \
+    --container_image nvcr.io/nvidia/nemo:26.02 \
     --env_vars_file env_vars.json \
     --pvc_name fsx-claim \
     --pvc_mount_path /mnt/nemo
@@ -496,7 +496,7 @@ python finetune_default_dataset.py \
     --nodes 1 \
     --gpus L40S \
     --gpu-devices 4 \
-    --container_image nvcr.io/nvidia/nemo:25.04.01 \
+    --container_image nvcr.io/nvidia/nemo:26.02 \
     --env_vars_file env_vars.json \
     --pvc_name fsx-claim \
     --pvc_mount_path /mnt/nemo
@@ -509,7 +509,7 @@ python finetune_default_dataset.py \
     --nodes 1 \
     --gpus L40S \
     --gpu-devices 4 \
-    --container_image nvcr.io/nvidia/nemo:25.04.01 \
+    --container_image nvcr.io/nvidia/nemo:26.02 \
     --env_vars_file env_vars.json \
     --pvc_name fsx-claim \
     --pvc_mount_path /mnt/nemo \
@@ -523,7 +523,7 @@ python finetune_default_dataset.py \
     --nodes 1 \
     --gpus L40S \
     --gpu-devices 4 \
-    --container_image nvcr.io/nvidia/nemo:25.04.01 \
+    --container_image nvcr.io/nvidia/nemo:26.02 \
     --env_vars_file env_vars.json \
     --pvc_name fsx-claim \
     --pvc_mount_path /mnt/nemo \
@@ -554,7 +554,7 @@ python finetune_custom_dataset.py \
     --nodes 1 \
     --gpus L40S \
     --gpu-devices 4 \
-    --container_image nvcr.io/nvidia/nemo:25.04.01 \
+    --container_image nvcr.io/nvidia/nemo:26.02 \
     --env_vars_file env_vars.json \
     --pvc_name fsx-claim \
     --pvc_mount_path /mnt/nemo \
@@ -570,7 +570,7 @@ python finetune_custom_dataset.py \
     --nodes 2 \
     --gpus L40S \
     --gpu-devices 4 \
-    --container_image nvcr.io/nvidia/nemo:25.04.01 \
+    --container_image nvcr.io/nvidia/nemo:26.02 \
     --env_vars_file env_vars.json \
     --pvc_name fsx-claim \
     --pvc_mount_path /mnt/nemo \

diff --git a/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml b/3.test_cases/megatron/nemo/kubernetes/data-processing/data-processing-pod-template.yaml
@@ -8,7 +8,7 @@ spec:
   restartPolicy: Never
   containers:
   - name: nemo-processing
-    image: nvcr.io/nvidia/nemo:25.04.01
+    image: nvcr.io/nvidia/nemo:26.02
     command: ["/bin/bash"]
     args: ["-c", "sleep infinity"]
     resources:

diff --git a/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py b/3.test_cases/megatron/nemo/kubernetes/finetune_custom_dataset.py
@@ -22,7 +22,7 @@
 # function in the CustomDataModule class to match your dataset's structure
 # =============================================================================
 
-# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
+# python finetune_custom_dataset.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
 
 
 def get_parser():
@@ -32,7 +32,7 @@ def get_parser():
    parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8)
    parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None)
    parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200)
-   parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12")
+   parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02")
    parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json")
    parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim")
    parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo")
@@ -109,7 +109,7 @@ def skypilot_executor(
    gpus: str = "L40S",
    efa_devices: Optional[int] = None,
    custom_mounts: Optional[dict[str, str]] = None,
-   container_image: str = "nvcr.io/nvidia/nemo:24.12",
+   container_image: str = "nvcr.io/nvidia/nemo:26.02",
    env_vars_file: str = "env_vars.json",
    pvc_name: str = "nemo-runs",
    lora_enabled: bool = False,

diff --git a/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py b/3.test_cases/megatron/nemo/kubernetes/finetune_default_dataset.py
@@ -16,7 +16,7 @@
 from nemo.utils import logging
 
 
-# python finetune.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
+# python finetune_default_dataset.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
 
 
 def get_parser():
@@ -26,7 +26,7 @@ def get_parser():
    parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8)
    parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None)
    parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200)
-   parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12")
+   parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02")
    parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json")
    parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim")
    parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo")
@@ -76,7 +76,7 @@ def skypilot_executor(
    gpus: str = "L40S",
    efa_devices: Optional[int] = None,
    custom_mounts: Optional[dict[str, str]] = None,
-   container_image: str = "nvcr.io/nvidia/nemo:24.12",
+   container_image: str = "nvcr.io/nvidia/nemo:26.02",
    env_vars_file: str = "env_vars.json",
    pvc_name: str = "nemo-runs",
    lora_enabled: bool = False,

diff --git a/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py b/3.test_cases/megatron/nemo/kubernetes/pretrain_custom_dataset.py
@@ -12,7 +12,7 @@
 from nemo.utils import logging
 from datasets import load_dataset
 
-# python pretrain.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:24.12 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
+# python pretrain_custom_dataset.py --max_steps 200 --nodes 1 --gpus L40S --gpu-devices 8 --container_image nvcr.io/nvidia/nemo:26.02 --env_vars_file env_vars.json --pvc_name fsx-claim --pvc_mount_path /mnt/nemo
 
 
 def small_llama_cfg() -> llm.GPTConfig:
@@ -36,7 +36,7 @@ def get_parser():
    parser.add_argument("--gpu-devices", type=int, help="Number of GPUs per node", default=8)
    parser.add_argument("--efa-devices", type=int, help="Number of EFA devices per node", default=None)
    parser.add_argument("--max_steps", type=int, help="Maximum number of steps", default=200)
-   parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:24.12")
+   parser.add_argument("--container_image", type=str, help="Container image to use", default="nvcr.io/nvidia/nemo:26.02")
    parser.add_argument("--env_vars_file", type=str, help="Path to the JSON file with environment variables", default="env_vars.json")
    parser.add_argument("--pvc_name", type=str, help="Name of the Persistent Volume Claim to use", default="fsx-claim")
    parser.add_argument("--pvc_mount_path", type=str, help="Path where the PVC should be mounted in the container", default="/mnt/nemo")
@@ -54,7 +54,7 @@ def skypilot_executor(
    gpus: str = "L40S",
    efa_devices: Optional[int] = None,
    custom_mounts: Optional[dict[str, str]] = None,
-   container_image: str = "nvcr.io/nvidia/nemo:24.12",
+   container_image: str = "nvcr.io/nvidia/nemo:26.02",
    env_vars_file: str = "env_vars.json",
    pvc_name: str = "nemo-runs"
 ) -> run.SkypilotExecutor: