From 6bb715669289cd849a7d7db78cd0b183e7a4aa40 Mon Sep 17 00:00:00 2001
From: laraPPr
Date: Tue, 24 Jun 2025 16:36:07 +0200
Subject: [PATCH 1/5] Create utils function for nvidia-smi check

Signed-off-by: laraPPr
---
 EESSI-install-software.sh | 16 +---------------
 bot/build.sh              | 12 +-----------
 bot/test.sh               | 11 +----------
 scripts/utils.sh          | 13 +++++++++++++
 4 files changed, 16 insertions(+), 36 deletions(-)

diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh
index c491819f..a674fa6f 100755
--- a/EESSI-install-software.sh
+++ b/EESSI-install-software.sh
@@ -17,11 +17,6 @@ display_help() {
   echo "  --skip-cuda-install  -  disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
 }
 
-# Function to check if a command exists
-function command_exists() {
-    command -v "$1" >/dev/null 2>&1
-}
-
 function copy_build_log() {
     # copy specified build log to specified directory, with some context added
     build_log=${1}
@@ -307,16 +302,7 @@ fi
 # Install NVIDIA drivers in host_injections (if they exist)
 if command_exists "nvidia-smi"; then
     export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}"
-    nvidia-smi --version
-    ec=$?
-    if [ ${ec} -eq 0 ]; then
-        echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
-        ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
-    else
-        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
-        echo "This script now assumes this is NOT a GPU node."
-        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
-    fi
+    check_nvidia-smi_installation
 fi
 
 if [ ! -z "${shared_fs_path}" ]; then
diff --git a/bot/build.sh b/bot/build.sh
index 2bba0cba..7875e70f 100755
--- a/bot/build.sh
+++ b/bot/build.sh
@@ -264,18 +264,8 @@ BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
 if command_exists "nvidia-smi"; then
     # Accept that this may fail
     set +e
-    nvidia-smi --version
-    ec=$?
+    check_nvidia-smi_installation
     set -e
-    if [ ${ec} -eq 0 ]; then
-        echo "Command 'nvidia-smi' found, using available GPU"
-        BUILD_STEP_ARGS+=("--nvidia" "all")
-    else
-        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
-        echo "This script now assumes this is NOT a GPU node."
-        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
-        BUILD_STEP_ARGS+=("--nvidia" "install")
-    fi
 else
     echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
     BUILD_STEP_ARGS+=("--nvidia" "install")
diff --git a/bot/test.sh b/bot/test.sh
index 168b0d5c..0eaf07dd 100755
--- a/bot/test.sh
+++ b/bot/test.sh
@@ -222,17 +222,8 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")
 if command_exists "nvidia-smi"; then
     # Accept that this may fail
     set +e
-    nvidia-smi --version
-    ec=$?
+    check_nvidia-smi_installation
     set -e
-    if [ ${ec} -eq 0 ]; then
-        echo "Command 'nvidia-smi' found, using available GPU"
-        TEST_STEP_ARGS+=("--nvidia" "run")
-    else
-        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
-        echo "This script now assumes this is NOT a GPU node."
-        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
-    fi
 fi
 
 # prepare arguments to test_suite.sh (specific to test step)
diff --git a/scripts/utils.sh b/scripts/utils.sh
index 962decd2..cb0a5fe7 100644
--- a/scripts/utils.sh
+++ b/scripts/utils.sh
@@ -147,3 +147,16 @@ function get_ipv4_address {
     echo "${hipv4}"
     return 0
 }
+
+function check_nvidia-smi_installation {
+    nvidia-smi --version
+    ec=$?
+    if [ ${ec} -eq 0 ]; then
+        echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
+        ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
+    else
+        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
+        echo "This script now assumes this is NOT a GPU node."
+        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
+    fi
+}

From fb8fdfd673cc6e449d23cc50fef0efc807b8532b Mon Sep 17 00:00:00 2001
From: ocaisa
Date: Thu, 31 Jul 2025 16:11:16 +0200
Subject: [PATCH 2/5] Update tests_scripts.yml

---
 .github/workflows/tests_scripts.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml
index 7a8f5fa4..8b55b9a3 100644
--- a/.github/workflows/tests_scripts.yml
+++ b/.github/workflows/tests_scripts.yml
@@ -51,7 +51,7 @@ jobs:
         # can't test with EasyBuild versions older than v4.5.2 when using EESSI 2023.06,
         # since Python in compat layer is Python 3.11.x;
         # testing with a single EasyBuild version takes a while in GitHub Actions, so stick to a single sensible version
-        for EB_VERSION in '4.6.0'; do
+        for EB_VERSION in '5.1.0'; do
             # Create script that uses load_easybuild_module.sh which we can run in compat layer environment
             # note: Be careful with single vs double quotes below!
             #       ${EB_VERSION} should be expanded, so use double quotes;

From 986f58d27a9d25e04f7a0e4480c2a782afce164d Mon Sep 17 00:00:00 2001
From: ocaisa
Date: Thu, 31 Jul 2025 16:24:13 +0200
Subject: [PATCH 3/5] Update tests_scripts.yml

---
 .github/workflows/tests_scripts.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml
index 8b55b9a3..db942c57 100644
--- a/.github/workflows/tests_scripts.yml
+++ b/.github/workflows/tests_scripts.yml
@@ -113,7 +113,7 @@ jobs:
 
         # scripts need to be copied to /tmp,
         # since create_directory_tarballs.sh must be accessible from within build container
-        ./eessi_container.sh --mode run --verbose /software-layer-scripts/create_directory_tarballs.sh 2023.06
+        ./eessi_container.sh --mode run --verbose /software-layer-scripts/create_directory_tarballs.sh "${{matrix.EESSI_VERSION}}"
 
         # check if tarballs have been produced
         ls -l *.tar.gz

From 4c33e6a1c27dd8d1afb8ac41d66b7e003be4d373 Mon Sep 17 00:00:00 2001
From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com>
Date: Fri, 1 Aug 2025 11:30:21 +0200
Subject: [PATCH 4/5] add test build

---
 .../nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml

diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml
new file mode 100644
index 00000000..0bf49b79
--- /dev/null
+++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen3_a100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml
@@ -0,0 +1,2 @@
+easyconfigs:
+  - pmt-1.2.0-GCCcore-12.3.0-CUDA-12.1.1.eb

From 49de3c3a9f361b7dc3b002099c9a8e04362a31bb Mon Sep 17 00:00:00 2001
From: laraPPr
Date: Fri, 1 Aug 2025 13:47:13 +0200
Subject: [PATCH 5/5] fix the nvidia-smi utils function

Signed-off-by: laraPPr
---
 EESSI-install-software.sh | 11 +++++++++--
 bot/build.sh              | 17 ++++++++++-------
 bot/test.sh               | 12 +++++++-----
 scripts/utils.sh          | 24 +++++++++++++++---------
 4 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh
index 4b0868ed..c76745cc 100755
--- a/EESSI-install-software.sh
+++ b/EESSI-install-software.sh
@@ -310,10 +310,17 @@ else
 fi
 
 # Install NVIDIA drivers in host_injections (if they exist)
-if command_exists "nvidia-smi"; then
+# Accept that this may fail
+set +e
+verify_nvidia-smi
+ec=$?
+if [ ${ec} -eq 0 ]; then
+    export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}"
+    ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
+elif [ ${ec} -eq 1 ]; then
     export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}"
-    check_nvidia-smi_installation
 fi
+set -e
 
 if [ ! -z "${shared_fs_path}" ]; then
     shared_eb_sourcepath=${shared_fs_path}/easybuild/sources
diff --git a/bot/build.sh b/bot/build.sh
index b40cdfac..6a08c7f2 100755
--- a/bot/build.sh
+++ b/bot/build.sh
@@ -220,15 +220,18 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
 BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
 
 # add options required to handle NVIDIA support
-if command_exists "nvidia-smi"; then
-    # Accept that this may fail
-    set +e
-    check_nvidia-smi_installation
-    set -e
-else
-    echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
+# Accept that this may fail
+set +e
+verify_nvidia-smi
+ec=$?
+if [ ${ec} -eq 0 ]; then
+    BUILD_STEP_ARGS+=("--nvidia" "all")
+elif [ ${ec} -eq 1 ]; then
+    BUILD_STEP_ARGS+=("--nvidia" "install")
+elif [ ${ec} -eq 2 ]; then
     BUILD_STEP_ARGS+=("--nvidia" "install")
 fi
+set -e
 
 # Retain location for host injections so we don't reinstall CUDA
 # (Always need to run the driver installation as available driver may change)
diff --git a/bot/test.sh b/bot/test.sh
index cfeccc99..0cb10174 100755
--- a/bot/test.sh
+++ b/bot/test.sh
@@ -225,12 +225,14 @@ fi
 TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")
 
 # add options required to handle NVIDIA support
-if command_exists "nvidia-smi"; then
-    # Accept that this may fail
-    set +e
-    check_nvidia-smi_installation
-    set -e
+# Accept that this may fail
+set +e
+verify_nvidia-smi
+ec=$?
+if [ ${ec} -eq 0 ]; then
+    TEST_STEP_ARGS+=("--nvidia" "run")
 fi
+set -e
 
 # prepare arguments to test_suite.sh (specific to test step)
 declare -a TEST_SUITE_ARGS=()
diff --git a/scripts/utils.sh b/scripts/utils.sh
index cb0a5fe7..2adc1a0d 100644
--- a/scripts/utils.sh
+++ b/scripts/utils.sh
@@ -148,15 +148,21 @@ function get_ipv4_address {
     return 0
 }
 
-function check_nvidia-smi_installation {
-    nvidia-smi --version
-    ec=$?
-    if [ ${ec} -eq 0 ]; then
-        echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
-        ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
+function verify_nvidia-smi {
+    if command_exists "nvidia-smi"; then
+        nvidia-smi --version
+        ec=$?
+        if [ ${ec} -eq 0 ]; then
+            echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
+            return 0
+        else
+            echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
+            echo "This script now assumes this is NOT a GPU node."
+            echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
+            return 1
+        fi
     else
-        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
-        echo "This script now assumes this is NOT a GPU node."
-        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
+        echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
+        return 2
     fi
 }