From 040514f4e96d699c1e2751df6f9c09aeccb3cfbe Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 10:53:03 +0100 Subject: [PATCH 01/27] Add cicd-ext CI configuration --- .cscs-ci/container/build.Containerfile | 8 ++ .cscs-ci/container/deps.Containerfile | 24 ++++ .cscs-ci/default.yaml | 191 +++++++++++++++++++++++++ .cscs-ci/spack/libfabric.yaml | 6 + .cscs-ci/spack/mpi.yaml | 6 + .cscs-ci/spack/nccl.yaml | 6 + .cscs-ci/spack/ucx.yaml | 6 + test/CMakeLists.txt | 3 +- test/bindings/fortran/CMakeLists.txt | 3 +- 9 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 .cscs-ci/container/build.Containerfile create mode 100644 .cscs-ci/container/deps.Containerfile create mode 100644 .cscs-ci/default.yaml create mode 100644 .cscs-ci/spack/libfabric.yaml create mode 100644 .cscs-ci/spack/mpi.yaml create mode 100644 .cscs-ci/spack/nccl.yaml create mode 100644 .cscs-ci/spack/ucx.yaml diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile new file mode 100644 index 00000000..784221cb --- /dev/null +++ b/.cscs-ci/container/build.Containerfile @@ -0,0 +1,8 @@ +ARG DEPS_IMAGE +FROM $DEPS_IMAGE + +COPY . /oomph +WORKDIR /oomph + +RUN spack -e ci build-env oomph -- cmake -B build -DOOMPH_WITH_TESTING=ON -DMPIEXEC_EXECUTABLE="" -DMPIEXEC_NUMPROC_FLAG="" -DMPIEXEC_PREFLAGS="" -DMPIEXEC_POSTFLAGS="" && \ + spack -e ci build-env oomph -- cmake --build build -j$(nproc) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile new file mode 100644 index 00000000..73c225c1 --- /dev/null +++ b/.cscs-ci/container/deps.Containerfile @@ -0,0 +1,24 @@ +FROM ghcr.io/eth-cscs/alps-images:py26.01-alps3-base + +ARG SPACK_SHA=develop +ARG SPACK_PACKAGES_SHA=main +ARG SPACK_ENV_FILE + +ENV DEBIAN_FRONTEND=noninteractive + +RUN mkdir -p /opt/spack && \ + curl -Ls "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack + +ENV PATH="/opt/spack/bin:$PATH" + +RUN mkdir -p /opt/spack-packages && \ + curl -Ls "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages + +RUN spack repo remove --scope defaults:base builtin && \ + spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin + +COPY $SPACK_ENV_FILE /spack_environment/spack.yaml + +RUN spack env create ci /spack_environment/spack.yaml && \ + spack -e ci concretize -f && \ + spack -e ci install --jobs $(nproc) --fail-fast --only=dependencies diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml new file mode 100644 index 00000000..959ba3d4 --- /dev/null +++ b/.cscs-ci/default.yaml @@ -0,0 +1,191 @@ +include: + - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' + +stages: + - build_deps + - build + - test + +variables: + # The base image is the py26.01 alps3 image from docs.cscs.ch + BASE_IMAGE: ghcr.io/eth-cscs/alps-images:py26.01-alps3-base + SPACK_SHA: develop + SPACK_PACKAGES_SHA: main + FF_TIMESTAMPS: true + +.build_deps_template: + stage: build_deps + timeout: 1 hours + before_script: + - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true + - export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile | head -c 16` + - export ENV_FILE_SHA=`sha256sum ${SPACK_ENV_FILE} | head -c 16` + - export CONFIG_TAG=`echo $DOCKERFILE_SHA-$BASE_IMAGE-$SPACK_SHA-$SPACK_PACKAGES_SHA-$ENV_FILE_SHA | sha256sum - | head -c 16` + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-spack-deps-$BACKEND:$CONFIG_TAG + - echo -e "CONFIG_TAG=$CONFIG_TAG" >> base-${BACKEND}.env + - echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env + variables: + DOCKERFILE: .cscs-ci/container/deps.Containerfile + DOCKER_BUILD_ARGS: '["SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' + artifacts: + reports: + dotenv: base-${BACKEND}.env + +build_deps_nccl: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: nccl + SPACK_ENV_FILE: .cscs-ci/spack/nccl.yaml + +build_deps_mpi: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: mpi + SPACK_ENV_FILE: .cscs-ci/spack/mpi.yaml + +build_deps_ucx: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: ucx + SPACK_ENV_FILE: .cscs-ci/spack/ucx.yaml + +build_deps_libfabric: + extends: + - .container-builder-cscs-gh200 + - .build_deps_template + variables: + BACKEND: libfabric + SPACK_ENV_FILE: .cscs-ci/spack/libfabric.yaml + +.build_template: + stage: build + extends: .container-builder-cscs-gh200 + timeout: 1 hours + before_script: + - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true + - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-build-$BACKEND:$CI_COMMIT_SHA + - echo -e "BUILD_IMAGE=$PERSIST_IMAGE_NAME" >> build-${BACKEND}.env + variables: + DOCKERFILE: .cscs-ci/container/build.Containerfile + DOCKER_BUILD_ARGS: '["DEPS_IMAGE"]' + artifacts: + reports: + dotenv: build-${BACKEND}.env + +build_nccl: + extends: .build_template + needs: + - job: build_deps_nccl + artifacts: true + variables: + BACKEND: nccl + +build_mpi: + extends: .build_template + needs: + - job: build_deps_mpi + artifacts: true + variables: + BACKEND: mpi + +build_ucx: + extends: .build_template + needs: + - job: build_deps_ucx + artifacts: true + variables: + BACKEND: ucx + +build_libfabric: + extends: .build_template + needs: + - job: build_deps_libfabric + artifacts: true + variables: + BACKEND: libfabric + +.test_serial_template: + stage: test + extends: .container-runner-clariden-gh200 + variables: + SLURM_JOB_NUM_NODES: 1 + SLURM_NTASKS: 1 + SLURM_TIMELIMIT: '00:15:00' + SLURM_PARTITION: normal + script: + - ctest --test-dir build -L "serial" --output-on-failure + +.test_parallel_template: + stage: test + extends: .container-runner-clariden-gh200 + variables: + SLURM_JOB_NUM_NODES: 1 + SLURM_NTASKS: 4 + SLURM_TIMELIMIT: '00:15:00' + SLURM_PARTITION: normal + SLURM_MPI: pmix + MPICH_GPU_SUPPORT_ENABLED: 1 + script: + - srun -n 4 ctest --test-dir build -L "parallel-ranks-4" --output-on-failure + +test_serial_nccl: + extends: .test_serial_template + needs: + - job: build_nccl + artifacts: true + image: $BUILD_IMAGE + +test_parallel_nccl: + extends: .test_parallel_template + needs: + - job: build_nccl + artifacts: true + image: $BUILD_IMAGE + +test_serial_mpi: + extends: .test_serial_template + needs: + - job: build_mpi + artifacts: true + image: $BUILD_IMAGE + +test_parallel_mpi: + extends: .test_parallel_template + needs: + - job: build_mpi + artifacts: true + image: $BUILD_IMAGE + +test_serial_ucx: + extends: .test_serial_template + needs: + - job: build_ucx + artifacts: true + image: $BUILD_IMAGE + +test_parallel_ucx: + extends: .test_parallel_template + needs: + - job: build_ucx + artifacts: true + image: $BUILD_IMAGE + +test_serial_libfabric: + extends: .test_serial_template + needs: + - job: build_libfabric + artifacts: true + image: $BUILD_IMAGE + +test_parallel_libfabric: + extends: .test_parallel_template + needs: + - job: build_libfabric + artifacts: true + image: $BUILD_IMAGE diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml new file mode 100644 index 00000000..f659f278 --- /dev/null +++ b/.cscs-ci/spack/libfabric.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=libfabric +cuda +python + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml new file mode 100644 index 00000000..696d894d --- /dev/null +++ b/.cscs-ci/spack/mpi.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=mpi +cuda +python + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml new file mode 100644 index 00000000..2dc59834 --- /dev/null +++ b/.cscs-ci/spack/nccl.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=nccl +cuda +python + view: false + concretizer: + unify: true diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml new file mode 100644 index 00000000..76100e29 --- /dev/null +++ b/.cscs-ci/spack/ucx.yaml @@ -0,0 +1,6 @@ +spack: + specs: + - oomph backend=ucx +cuda +python + view: false + concretizer: + unify: true diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5217bbaf..31fea066 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -48,6 +48,7 @@ function(reg_serial_test t) add_test( NAME ${t} COMMAND $) + set_tests_properties(${t} PROPERTIES LABELS "serial") endfunction() foreach(t ${serial_tests}) @@ -65,7 +66,7 @@ function(reg_parallel_test t_ lib n) NAME ${t} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} $ ${MPIEXEC_POSTFLAGS}) - set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE) + set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE LABELS "parallel-ranks-${n}") endfunction() if (OOMPH_WITH_MPI) diff --git a/test/bindings/fortran/CMakeLists.txt b/test/bindings/fortran/CMakeLists.txt index 974d2f7c..10e69e15 100644 --- a/test/bindings/fortran/CMakeLists.txt +++ b/test/bindings/fortran/CMakeLists.txt @@ -30,7 +30,8 @@ function(reg_parallel_test_f t_ lib n nthr) COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} $ ${MPIEXEC_POSTFLAGS}) set_tests_properties(${t} PROPERTIES - ENVIRONMENT OMP_NUM_THREADS=${nthr}) + ENVIRONMENT OMP_NUM_THREADS=${nthr} + LABELS "parallel-ranks-${n}") endfunction() if (OOMPH_WITH_MPI) From ba1784c191cc247422c4d6c2a7ae159482f5af9c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 16:45:03 +0100 Subject: [PATCH 02/27] Apply suggestions from code review Co-authored-by: Mikael Simberg --- .cscs-ci/container/deps.Containerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index 73c225c1..a07e4797 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -1,7 +1,7 @@ FROM ghcr.io/eth-cscs/alps-images:py26.01-alps3-base -ARG SPACK_SHA=develop -ARG SPACK_PACKAGES_SHA=main +ARG SPACK_SHA=v1.1.1 +ARG SPACK_PACKAGES_SHA=bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 ARG SPACK_ENV_FILE ENV DEBIAN_FRONTEND=noninteractive From 8cf50769349d3f432e602a44f2c48a94e56d769d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 16:45:54 +0100 Subject: [PATCH 03/27] Apply suggestion from @msimberg --- .cscs-ci/container/deps.Containerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index a07e4797..32920e3d 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -19,6 +19,7 @@ RUN spack repo remove --scope defaults:base builtin && \ COPY $SPACK_ENV_FILE /spack_environment/spack.yaml -RUN spack env create ci /spack_environment/spack.yaml && \ +RUN spack external find --all && \ + spack env create ci /spack_environment/spack.yaml && \ spack -e ci concretize -f && \ spack -e ci install --jobs $(nproc) --fail-fast --only=dependencies From 5423771993a2b6bec66453337d9ba61a9bd3e0a1 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 21:59:35 +0100 Subject: [PATCH 04/27] Fix CI container build args --- .cscs-ci/container/deps.Containerfile | 12 +++++------- .cscs-ci/default.yaml | 7 +++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index 32920e3d..bcba848f 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -1,22 +1,20 @@ -FROM ghcr.io/eth-cscs/alps-images:py26.01-alps3-base - -ARG SPACK_SHA=v1.1.1 -ARG SPACK_PACKAGES_SHA=bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 -ARG SPACK_ENV_FILE - -ENV DEBIAN_FRONTEND=noninteractive +ARG BASE_IMAGE +FROM BASE_IMAGE +ARG SPACK_SHA RUN mkdir -p /opt/spack && \ curl -Ls "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack ENV PATH="/opt/spack/bin:$PATH" +ARG SPACK_PACKAGES_SHA RUN mkdir -p /opt/spack-packages && \ curl -Ls "https://api.github.com/repos/spack/spack-packages/tarball/$SPACK_PACKAGES_SHA" | tar --strip-components=1 -xz -C /opt/spack-packages RUN spack repo remove --scope defaults:base builtin && \ spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin +ARG SPACK_ENV_FILE COPY $SPACK_ENV_FILE /spack_environment/spack.yaml RUN spack external find --all && \ diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 959ba3d4..cc1818b4 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -7,10 +7,9 @@ stages: - test variables: - # The base image is the py26.01 alps3 image from docs.cscs.ch - BASE_IMAGE: ghcr.io/eth-cscs/alps-images:py26.01-alps3-base - SPACK_SHA: develop - SPACK_PACKAGES_SHA: main + BASE_IMAGE: jfrog.svc.ccs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 + SPACK_SHA: v1.1.1 + SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 FF_TIMESTAMPS: true .build_deps_template: From 3f15f1f207e5a5c4c73a80f267255c9fd88e54ab Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:01:56 +0100 Subject: [PATCH 05/27] Specify oomph@main in spack environments --- .cscs-ci/spack/libfabric.yaml | 2 +- .cscs-ci/spack/mpi.yaml | 2 +- .cscs-ci/spack/nccl.yaml | 2 +- .cscs-ci/spack/ucx.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml index f659f278..27fdfb08 100644 --- a/.cscs-ci/spack/libfabric.yaml +++ b/.cscs-ci/spack/libfabric.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=libfabric +cuda +python + - oomph@main backend=libfabric +cuda +python view: false concretizer: unify: true diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml index 696d894d..90e45ff8 100644 --- a/.cscs-ci/spack/mpi.yaml +++ b/.cscs-ci/spack/mpi.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=mpi +cuda +python + - oomph@main backend=mpi +cuda +python view: false concretizer: unify: true diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml index 2dc59834..4c08a383 100644 --- a/.cscs-ci/spack/nccl.yaml +++ b/.cscs-ci/spack/nccl.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=nccl +cuda +python + - oomph@main backend=nccl +cuda +python view: false concretizer: unify: true diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml index 76100e29..251a4ec9 100644 --- a/.cscs-ci/spack/ucx.yaml +++ b/.cscs-ci/spack/ucx.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph backend=ucx +cuda +python + - oomph@main backend=ucx +cuda +python view: false concretizer: unify: true From b832fe622e928f849e6a132b2ac191880eac4acb Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:02:26 +0100 Subject: [PATCH 06/27] Remove +python from spack specs --- .cscs-ci/spack/libfabric.yaml | 2 +- .cscs-ci/spack/mpi.yaml | 2 +- .cscs-ci/spack/nccl.yaml | 2 +- .cscs-ci/spack/ucx.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.cscs-ci/spack/libfabric.yaml b/.cscs-ci/spack/libfabric.yaml index 27fdfb08..fac7f88f 100644 --- a/.cscs-ci/spack/libfabric.yaml +++ b/.cscs-ci/spack/libfabric.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=libfabric +cuda +python + - oomph@main backend=libfabric +cuda view: false concretizer: unify: true diff --git a/.cscs-ci/spack/mpi.yaml b/.cscs-ci/spack/mpi.yaml index 90e45ff8..d59aab13 100644 --- a/.cscs-ci/spack/mpi.yaml +++ b/.cscs-ci/spack/mpi.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=mpi +cuda +python + - oomph@main backend=mpi +cuda view: false concretizer: unify: true diff --git a/.cscs-ci/spack/nccl.yaml b/.cscs-ci/spack/nccl.yaml index 4c08a383..94f0dd31 100644 --- a/.cscs-ci/spack/nccl.yaml +++ b/.cscs-ci/spack/nccl.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=nccl +cuda +python + - oomph@main backend=nccl +cuda view: false concretizer: unify: true diff --git a/.cscs-ci/spack/ucx.yaml b/.cscs-ci/spack/ucx.yaml index 251a4ec9..51377dd8 100644 --- a/.cscs-ci/spack/ucx.yaml +++ b/.cscs-ci/spack/ucx.yaml @@ -1,6 +1,6 @@ spack: specs: - - oomph@main backend=ucx +cuda +python + - oomph@main backend=ucx +cuda view: false concretizer: unify: true From 1e851df257cb97ba664ec392f0c472bcb9c3a615 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:10:23 +0100 Subject: [PATCH 07/27] Remove stages --- .cscs-ci/default.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index cc1818b4..c3e80df3 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -1,11 +1,6 @@ include: - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' -stages: - - build_deps - - build - - test - variables: BASE_IMAGE: jfrog.svc.ccs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 SPACK_SHA: v1.1.1 @@ -13,7 +8,6 @@ variables: FF_TIMESTAMPS: true .build_deps_template: - stage: build_deps timeout: 1 hours before_script: - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true @@ -63,7 +57,6 @@ build_deps_libfabric: SPACK_ENV_FILE: .cscs-ci/spack/libfabric.yaml .build_template: - stage: build extends: .container-builder-cscs-gh200 timeout: 1 hours before_script: @@ -110,7 +103,6 @@ build_libfabric: BACKEND: libfabric .test_serial_template: - stage: test extends: .container-runner-clariden-gh200 variables: SLURM_JOB_NUM_NODES: 1 @@ -121,7 +113,6 @@ build_libfabric: - ctest --test-dir build -L "serial" --output-on-failure .test_parallel_template: - stage: test extends: .container-runner-clariden-gh200 variables: SLURM_JOB_NUM_NODES: 1 From 0398d59869d80c6a002ceb37497820bc384b50a9 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 25 Mar 2026 22:13:40 +0100 Subject: [PATCH 08/27] Refactor ci config --- .cscs-ci/default.yaml | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index c3e80df3..2a0bc830 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -19,42 +19,39 @@ variables: - echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env variables: DOCKERFILE: .cscs-ci/container/deps.Containerfile - DOCKER_BUILD_ARGS: '["SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' + DOCKER_BUILD_ARGS: '["BASE_IMAGE", "SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' + SPACK_ENV_FILE: .cscs./spack/$BACKEND.yaml artifacts: reports: dotenv: base-${BACKEND}.env build_deps_nccl: + variables: + BACKEND: nccl extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: nccl - SPACK_ENV_FILE: .cscs-ci/spack/nccl.yaml build_deps_mpi: + variables: + BACKEND: mpi extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: mpi - SPACK_ENV_FILE: .cscs-ci/spack/mpi.yaml build_deps_ucx: + variables: + BACKEND: ucx extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: ucx - SPACK_ENV_FILE: .cscs-ci/spack/ucx.yaml build_deps_libfabric: + variables: + BACKEND: libfabric extends: - .container-builder-cscs-gh200 - .build_deps_template - variables: - BACKEND: libfabric - SPACK_ENV_FILE: .cscs-ci/spack/libfabric.yaml .build_template: extends: .container-builder-cscs-gh200 @@ -71,36 +68,36 @@ build_deps_libfabric: dotenv: build-${BACKEND}.env build_nccl: + variables: + BACKEND: nccl extends: .build_template needs: - job: build_deps_nccl artifacts: true - variables: - BACKEND: nccl build_mpi: + variables: + BACKEND: mpi extends: .build_template needs: - job: build_deps_mpi artifacts: true - variables: - BACKEND: mpi build_ucx: + variables: + BACKEND: ucx extends: .build_template needs: - job: build_deps_ucx artifacts: true - variables: - BACKEND: ucx build_libfabric: + variables: + BACKEND: libfabric extends: .build_template needs: - job: build_deps_libfabric artifacts: true - variables: - BACKEND: libfabric .test_serial_template: extends: .container-runner-clariden-gh200 From b1297aca28222bdc614d04851e4d5d3f6bd129e8 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:13:10 +0100 Subject: [PATCH 09/27] Fix base image --- .cscs-ci/container/deps.Containerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index bcba848f..50570529 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -1,5 +1,5 @@ ARG BASE_IMAGE -FROM BASE_IMAGE +FROM $BASE_IMAGE ARG SPACK_SHA RUN mkdir -p /opt/spack && \ From 34847222c671735ee2ec057d9fad716162bb866b Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:26:27 +0100 Subject: [PATCH 10/27] Fix typo --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 2a0bc830..2f3e3db2 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -2,7 +2,7 @@ include: - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml' variables: - BASE_IMAGE: jfrog.svc.ccs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 + BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps3 SPACK_SHA: v1.1.1 SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25 FF_TIMESTAMPS: true From 33624ef1261b5d0ef305f77e015d6ac6d2f68212 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:31:11 +0100 Subject: [PATCH 11/27] Fix env file path --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 2f3e3db2..14baa148 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -20,7 +20,7 @@ variables: variables: DOCKERFILE: .cscs-ci/container/deps.Containerfile DOCKER_BUILD_ARGS: '["BASE_IMAGE", "SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]' - SPACK_ENV_FILE: .cscs./spack/$BACKEND.yaml + SPACK_ENV_FILE: .cscs-ci/spack/$BACKEND.yaml artifacts: reports: dotenv: base-${BACKEND}.env From a3e950dfbf5e5cc4340cb1af96ee3809778f4801 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 09:45:54 +0100 Subject: [PATCH 12/27] Update cmake config in CI --- .cscs-ci/container/build.Containerfile | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index 784221cb..1f010a87 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -4,5 +4,16 @@ FROM $DEPS_IMAGE COPY . /oomph WORKDIR /oomph -RUN spack -e ci build-env oomph -- cmake -B build -DOOMPH_WITH_TESTING=ON -DMPIEXEC_EXECUTABLE="" -DMPIEXEC_NUMPROC_FLAG="" -DMPIEXEC_PREFLAGS="" -DMPIEXEC_POSTFLAGS="" && \ +ARG BACKEND +RUN spack -e ci build-env oomph -- \ + cmake -G Ninja -B build \ + -DOOMPH_WITH_TESTING=ON \ + # Converte BACKEND to uppercase + -DOOMPH_WITH_$(echo $BACKEND | tr '[:lower:]' '[:upper:]')=ON \ + -DOOMPH_USE_BUNDLED_LIBS=ON \ + -DOOMPH_USE_BUNDLED_HWMALLOC=OFF \ + -DMPIEXEC_EXECUTABLE="" \ + -DMPIEXEC_NUMPROC_FLAG="" \ + -DMPIEXEC_PREFLAGS="" \ + -DMPIEXEC_POSTFLAGS="" && \ spack -e ci build-env oomph -- cmake --build build -j$(nproc) From 9871e880437958381c6d0280aeee276f66f5a760 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 10:18:52 +0100 Subject: [PATCH 13/27] Use NUM_PROCS instead of nproc --- .cscs-ci/container/build.Containerfile | 3 ++- .cscs-ci/container/deps.Containerfile | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index 1f010a87..c16ce28d 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -5,6 +5,7 @@ COPY . /oomph WORKDIR /oomph ARG BACKEND +ARG NUM_PROCS RUN spack -e ci build-env oomph -- \ cmake -G Ninja -B build \ -DOOMPH_WITH_TESTING=ON \ @@ -16,4 +17,4 @@ RUN spack -e ci build-env oomph -- \ -DMPIEXEC_NUMPROC_FLAG="" \ -DMPIEXEC_PREFLAGS="" \ -DMPIEXEC_POSTFLAGS="" && \ - spack -e ci build-env oomph -- cmake --build build -j$(nproc) + spack -e ci build-env oomph -- cmake --build build $NUM_PROCS diff --git a/.cscs-ci/container/deps.Containerfile b/.cscs-ci/container/deps.Containerfile index 50570529..5fc530bd 100644 --- a/.cscs-ci/container/deps.Containerfile +++ b/.cscs-ci/container/deps.Containerfile @@ -17,7 +17,8 @@ RUN spack repo remove --scope defaults:base builtin && \ ARG SPACK_ENV_FILE COPY $SPACK_ENV_FILE /spack_environment/spack.yaml +ARG NUM_PROCS RUN spack external find --all && \ spack env create ci /spack_environment/spack.yaml && \ spack -e ci concretize -f && \ - spack -e ci install --jobs $(nproc) --fail-fast --only=dependencies + spack -e ci install --jobs $NUM_PROCS --fail-fast --only=dependencies From 177592ddd10994449c066a517852f1a94295798c Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 10:47:21 +0100 Subject: [PATCH 14/27] Fix num procs --- .cscs-ci/container/build.Containerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index c16ce28d..66a8ae69 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -17,4 +17,4 @@ RUN spack -e ci build-env oomph -- \ -DMPIEXEC_NUMPROC_FLAG="" \ -DMPIEXEC_PREFLAGS="" \ -DMPIEXEC_POSTFLAGS="" && \ - spack -e ci build-env oomph -- cmake --build build $NUM_PROCS + spack -e ci build-env oomph -- cmake --build build -j$NUM_PROCS From 110d4eb869544c8d1e2efda08083a8f80443fe47 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 13:01:55 +0100 Subject: [PATCH 15/27] Update test job config --- .cscs-ci/default.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 14baa148..65467789 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -106,8 +106,9 @@ build_libfabric: SLURM_NTASKS: 1 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal + SLURM_NETWORK=disable_rdzv_get script: - - ctest --test-dir build -L "serial" --output-on-failure + - ctest --test-dir /oomph/build -L "serial" --output-on-failure .test_parallel_template: extends: .container-runner-clariden-gh200 @@ -117,9 +118,10 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_MPI: pmix + SLURM_NETWORK=disable_rdzv_get MPICH_GPU_SUPPORT_ENABLED: 1 script: - - srun -n 4 ctest --test-dir build -L "parallel-ranks-4" --output-on-failure + - srun -n 4 ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure test_serial_nccl: extends: .test_serial_template From 95183544bfbe219d59d1e1eb0b7337c5e7acfd8b Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 13:04:31 +0100 Subject: [PATCH 16/27] Fix syntax --- .cscs-ci/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 65467789..3b649519 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -106,7 +106,7 @@ build_libfabric: SLURM_NTASKS: 1 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal - SLURM_NETWORK=disable_rdzv_get + SLURM_NETWORK: disable_rdzv_get script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -118,7 +118,7 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_MPI: pmix - SLURM_NETWORK=disable_rdzv_get + SLURM_NETWORK: disable_rdzv_get MPICH_GPU_SUPPORT_ENABLED: 1 script: - srun -n 4 ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure From 5419c5be656be431f5658ebd3072bf0990869cd2 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 13:39:34 +0100 Subject: [PATCH 17/27] Fix parallel testing --- .cscs-ci/default.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 3b649519..c09ce896 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -119,9 +119,10 @@ build_libfabric: SLURM_PARTITION: normal SLURM_MPI: pmix SLURM_NETWORK: disable_rdzv_get + SLURM_LABELIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - - srun -n 4 ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure test_serial_nccl: extends: .test_serial_template From ffd02a45ce35f0da1ba76d38001415fee5b070b2 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 14:22:49 +0100 Subject: [PATCH 18/27] Explicitly ask for one gpu per task --- .cscs-ci/default.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index c09ce896..55780273 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -114,6 +114,7 @@ build_libfabric: extends: .container-runner-clariden-gh200 variables: SLURM_JOB_NUM_NODES: 1 + SLURM_GPUS_PER_TASK: 1 SLURM_NTASKS: 4 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal From a81f37f4b47d0f5e3e338c1c5c5e7417aad838e4 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 14:22:58 +0100 Subject: [PATCH 19/27] Verbose ctest output --- .cscs-ci/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 55780273..bcab0e30 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -123,7 +123,7 @@ build_libfabric: SLURM_LABELIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose test_serial_nccl: extends: .test_serial_template From 7b35569ffdc33c346d4ea0578dd8689990babb21 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 14:48:05 +0100 Subject: [PATCH 20/27] Explicitly set debug build for CI --- .cscs-ci/container/build.Containerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cscs-ci/container/build.Containerfile b/.cscs-ci/container/build.Containerfile index 66a8ae69..fe3e707f 100644 --- a/.cscs-ci/container/build.Containerfile +++ b/.cscs-ci/container/build.Containerfile @@ -8,8 +8,8 @@ ARG BACKEND ARG NUM_PROCS RUN spack -e ci build-env oomph -- \ cmake -G Ninja -B build \ + -DCMAKE_BUILD_TYPE=Debug \ -DOOMPH_WITH_TESTING=ON \ - # Converte BACKEND to uppercase -DOOMPH_WITH_$(echo $BACKEND | tr '[:lower:]' '[:upper:]')=ON \ -DOOMPH_USE_BUNDLED_LIBS=ON \ -DOOMPH_USE_BUNDLED_HWMALLOC=OFF \ From 60e0e25cf9f1c3b3df23fb097184bc5c3af3a67f Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 14:50:50 +0100 Subject: [PATCH 21/27] Don't set any mpiexec options if MPIEXEC_EXECUTABLE is empty --- test/CMakeLists.txt | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 31fea066..e645a636 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -62,10 +62,15 @@ function(reg_parallel_test t_ lib n) oomph_target_compile_options(${t}) target_link_libraries(${t} PRIVATE gtest_main_mpi) target_link_libraries(${t} PRIVATE oomph_${lib}) - add_test( - NAME ${t} - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} - $ ${MPIEXEC_POSTFLAGS}) + # If not empty + if("${MPIEXEC_EXECUTABLE}" STREQUAL "") + add_test(NAME ${t} COMMAND $) + else() + add_test( + NAME ${t} + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${n} ${MPIEXEC_PREFLAGS} + $ ${MPIEXEC_POSTFLAGS}) + endif() set_tests_properties(${t} PROPERTIES RUN_SERIAL TRUE LABELS "parallel-ranks-${n}") endfunction() From c3ea5689c8a4e1c3ea7ce06d0aea3a8500749e01 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:37:53 +0100 Subject: [PATCH 22/27] Don't buffer test output --- .cscs-ci/default.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index bcab0e30..3b93bf25 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -107,6 +107,7 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get + SLURM_UNBUFFERED: 1 script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -121,6 +122,7 @@ build_libfabric: SLURM_MPI: pmix SLURM_NETWORK: disable_rdzv_get SLURM_LABELIO: 1 + SLURM_UNBUFFERED: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose From 521011e320f203e8cfc5427f24d00360fefa9ca3 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:38:18 +0100 Subject: [PATCH 23/27] Skip cancel test --- .cscs-ci/default.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 3b93bf25..7110ce50 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -125,7 +125,8 @@ build_libfabric: SLURM_UNBUFFERED: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose + # TODO: test_cancel hanging? + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose -E test_cancel test_serial_nccl: extends: .test_serial_template From abb418899d225730b6394df8c9fe3a9327f26466 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:46:42 +0100 Subject: [PATCH 24/27] Fix slurm variables --- .cscs-ci/default.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 7110ce50..9f2b60d7 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -107,7 +107,7 @@ build_libfabric: SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get - SLURM_UNBUFFERED: 1 + SLURM_UNBUFFEREDIO: 1 script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -119,10 +119,10 @@ build_libfabric: SLURM_NTASKS: 4 SLURM_TIMELIMIT: '00:15:00' SLURM_PARTITION: normal - SLURM_MPI: pmix + SLURM_MPI_TYPE: pmix SLURM_NETWORK: disable_rdzv_get SLURM_LABELIO: 1 - SLURM_UNBUFFERED: 1 + SLURM_UNBUFFEREDIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 script: # TODO: test_cancel hanging? From 3689ebee5882b7e08f81c51bc508245ee61f72ad Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:49:39 +0100 Subject: [PATCH 25/27] Shorten timeouts --- .cscs-ci/default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 9f2b60d7..2d831960 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -55,7 +55,7 @@ build_deps_libfabric: .build_template: extends: .container-builder-cscs-gh200 - timeout: 1 hours + timeout: 15 minutes before_script: - echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/oomph-build-$BACKEND:$CI_COMMIT_SHA @@ -117,7 +117,7 @@ build_libfabric: SLURM_JOB_NUM_NODES: 1 SLURM_GPUS_PER_TASK: 1 SLURM_NTASKS: 4 - SLURM_TIMELIMIT: '00:15:00' + SLURM_TIMELIMIT: '5:00' SLURM_PARTITION: normal SLURM_MPI_TYPE: pmix SLURM_NETWORK: disable_rdzv_get From 841d97bc758a1fe532b03700f17ae118a66ae55b Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 16:56:37 +0100 Subject: [PATCH 26/27] Don't load cxi hooks in CI --- .cscs-ci/default.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 2d831960..0f48134c 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -108,6 +108,7 @@ build_libfabric: SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get SLURM_UNBUFFEREDIO: 1 + USE_MPI: NO script: - ctest --test-dir /oomph/build -L "serial" --output-on-failure @@ -124,6 +125,7 @@ build_libfabric: SLURM_LABELIO: 1 SLURM_UNBUFFEREDIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 + USE_MPI: NO script: # TODO: test_cancel hanging? - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose -E test_cancel From d7995af0c2fe34da458e3192434050ed06a79464 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 26 Mar 2026 17:05:17 +0100 Subject: [PATCH 27/27] Update slurm and ctest options --- .cscs-ci/default.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.cscs-ci/default.yaml b/.cscs-ci/default.yaml index 0f48134c..98ccacff 100644 --- a/.cscs-ci/default.yaml +++ b/.cscs-ci/default.yaml @@ -108,9 +108,11 @@ build_libfabric: SLURM_PARTITION: normal SLURM_NETWORK: disable_rdzv_get SLURM_UNBUFFEREDIO: 1 + PMIX_MCA_psec: native + PMIX_MCA_gds: "^shmem2" USE_MPI: NO script: - - ctest --test-dir /oomph/build -L "serial" --output-on-failure + - ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 .test_parallel_template: extends: .container-runner-clariden-gh200 @@ -125,10 +127,12 @@ build_libfabric: SLURM_LABELIO: 1 SLURM_UNBUFFEREDIO: 1 MPICH_GPU_SUPPORT_ENABLED: 1 + PMIX_MCA_psec: native + PMIX_MCA_gds: "^shmem2" USE_MPI: NO script: # TODO: test_cancel hanging? - - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose -E test_cancel + - ctest --test-dir /oomph/build -L "parallel-ranks-4" --verbose --timeout 60 test_serial_nccl: extends: .test_serial_template