From 1fd638925fdc6caf240998d55546a45493ed8027 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 28 Jan 2026 14:36:01 +0100
Subject: [PATCH 01/68] Attempt to add CUDA support to the distributed CI pipeline

---
 ci/distributed.yml                |  3 ++-
 ci/docker/base_mpi.Dockerfile     | 36 ++++++++++++++++---------------
 ci/docker/checkout_mpi.Dockerfile |  2 +-
 scripts/ci-mpi-wrapper.sh         |  2 ++
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 8b173b22b0..d8f8b9e920 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -80,7 +80,8 @@ build_distributed_cpu:
   parallel:
     matrix:
       - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
-        BACKEND: [embedded, gtfn_cpu, dace_cpu]
+        # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu]
+        BACKEND: [dace_cpu, dace_gpu]
   rules:
     - if: $COMPONENT == 'atmosphere/diffusion'
       variables:
diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index 3fcdb21297..914b556136 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -4,23 +4,25 @@ ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8
 
 ARG DEBIAN_FRONTEND=noninteractive
-RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
-    strace \
-    build-essential \
-    tar \
-    wget \
-    curl \
-    libboost-dev \
-    libnuma-dev \
-    libopenmpi-dev \
-    ca-certificates \
-    libssl-dev \
-    autoconf \
-    automake \
-    libtool \
-    pkg-config \
-    libreadline-dev \
-    git && \
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        autoconf \
+        automake \
+        build-essential \
+        ca-certificates \
+        curl \
+        git \
+        libboost-dev \
+        libnuma-dev \
+        libopenmpi-dev \
+        libreadline-dev \
+        libssl-dev \
+        libtool \
+        nvidia-cuda-dev \
+        pkg-config \
+        strace \
+        tar \
+        wget && \
     rm -rf /var/lib/apt/lists/*
 
 # Install uv: https://docs.astral.sh/uv/guides/integration/docker
diff --git a/ci/docker/checkout_mpi.Dockerfile b/ci/docker/checkout_mpi.Dockerfile
index c229d6c374..62ea5daeae 100644
--- a/ci/docker/checkout_mpi.Dockerfile
+++ b/ci/docker/checkout_mpi.Dockerfile
@@ -8,4 +8,4 @@ ARG PYVERSION
 ARG VENV
 ENV UV_PROJECT_ENVIRONMENT=$VENV
 ENV MPI4PY_BUILD_BACKEND="scikit-build-core"
-RUN uv sync --extra distributed --python=$PYVERSION
+RUN uv sync --extra all --python=$PYVERSION
diff --git a/scripts/ci-mpi-wrapper.sh b/scripts/ci-mpi-wrapper.sh
index 900dd340ae..c0aa25d41f 100755
--- a/scripts/ci-mpi-wrapper.sh
+++ b/scripts/ci-mpi-wrapper.sh
@@ -17,6 +17,8 @@ else
     exit 1
 fi
 
+export CUDA_VISIBLE_DEVICES="${rank}"
+
 log_file="${CI_PROJECT_DIR:+${CI_PROJECT_DIR}/}pytest-log-rank-${rank}.txt"
 
 if [[ "${rank}" -eq 0 ]]; then

From cbb1891e84a85b316a550b81d497021313244190 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 28 Jan 2026 17:03:44 +0100
Subject: [PATCH 02/68] Add cuda12 extra

---
 ci/docker/checkout_mpi.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/docker/checkout_mpi.Dockerfile b/ci/docker/checkout_mpi.Dockerfile
index 62ea5daeae..4cbf1d32c0 100644
--- a/ci/docker/checkout_mpi.Dockerfile
+++ b/ci/docker/checkout_mpi.Dockerfile
@@ -8,4 +8,4 @@ ARG PYVERSION
 ARG VENV
 ENV UV_PROJECT_ENVIRONMENT=$VENV
 ENV MPI4PY_BUILD_BACKEND="scikit-build-core"
-RUN uv sync --extra all --python=$PYVERSION
+RUN uv sync --extra all --extra cuda12 --python=$PYVERSION

From bbb151cbef4a93e65a6ccb451bc9f24e10dfd36c Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 28 Jan 2026 18:02:02 +0100
Subject: [PATCH 03/68] Add nvidia-cuda-toolkit

---
 ci/docker/base_mpi.Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index 914b556136..92cb700e22 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -19,6 +19,7 @@ RUN apt-get update && \
         libssl-dev \
         libtool \
         nvidia-cuda-dev \
+        nvidia-cuda-toolkit \
         pkg-config \
         strace \
         tar \

From b9be7fb076c60cc495ce92aa8719b2d2032269a3 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 29 Jan 2026 13:02:09 +0100
Subject: [PATCH 04/68] Revert "refactor: testing infrastructure (#1002)"

This reverts commit e30c2f71e668952698fd93e3ce1a1c054029ea6c.
---
 .../model/common/utils/device_utils.py        |  3 ---
 .../icon4py/model/testing/data_handling.py    | 23 +++--------------
 .../model/testing/fixtures/datatest.py        | 25 +++++++++++++++++--
 .../icon4py/model/testing/stencil_tests.py    | 17 ++++++-------
 4 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/model/common/src/icon4py/model/common/utils/device_utils.py b/model/common/src/icon4py/model/common/utils/device_utils.py
index 360a53902a..cacfc8eb64 100644
--- a/model/common/src/icon4py/model/common/utils/device_utils.py
+++ b/model/common/src/icon4py/model/common/utils/device_utils.py
@@ -37,9 +37,6 @@ def sync(allocator: gtx_typing.FieldBufferAllocationUtil | None = None) -> None:
 
     Note: this is and ad-hoc interface, maybe the function should get the device to sync for.
     """
-    # Type annotation already describes that only these types are allowed, but mypy coverage is not great.
-    # The explicit assert avoids critical mistakes in using this function.
-    assert allocator is None or gtx_allocators.is_field_allocation_tool(allocator)
     if allocator is not None and is_cupy_device(allocator):
         cp.cuda.runtime.deviceSynchronize()
 
diff --git a/model/testing/src/icon4py/model/testing/data_handling.py b/model/testing/src/icon4py/model/testing/data_handling.py
index 9624c64839..9ecf932335 100644
--- a/model/testing/src/icon4py/model/testing/data_handling.py
+++ b/model/testing/src/icon4py/model/testing/data_handling.py
@@ -6,13 +6,11 @@
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
 
-import pathlib
 import tarfile
+from pathlib import Path
 
-from icon4py.model.testing import config, locking
 
-
-def download_and_extract(uri: str, dst: pathlib.Path, data_file: str = "downloaded.tar.gz") -> None:
+def download_and_extract(uri: str, dst: Path, data_file: str = "downloaded.tar.gz") -> None:
     """
     Download data archive from remote server.
 
@@ -33,19 +31,4 @@ def download_and_extract(uri: str, dst: pathlib.Path, data_file: str = "download
         raise OSError(f"{data_file} needs to be a valid tar file")
     with tarfile.open(data_file, mode="r:*") as tf:
         tf.extractall(path=dst)
-    pathlib.Path(data_file).unlink(missing_ok=True)
-
-
-def download_test_data(dst: pathlib.Path, uri: str) -> None:
-    if config.ENABLE_TESTDATA_DOWNLOAD:
-        # We create and lock the *parent* directory as we later check for existence of `dst`.
-        dst.parent.mkdir(parents=True, exist_ok=True)
-        with locking.lock(dst.parent):
-            if not dst.exists():
-                download_and_extract(uri, dst)
-    else:
-        # If test data download is disabled, we check if the directory exists
-        # without locking. We assume the location is managed by the user
-        # and avoid locking shared directories (e.g. on CI).
-        if not dst.exists():
-            raise RuntimeError(f"Test data {dst} does not exist, and downloading is disabled.")
+    Path(data_file).unlink(missing_ok=True)
diff --git a/model/testing/src/icon4py/model/testing/fixtures/datatest.py b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
index c1d17332e9..28483172a1 100644
--- a/model/testing/src/icon4py/model/testing/fixtures/datatest.py
+++ b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
@@ -17,7 +17,13 @@
 from icon4py.model.common import model_backends, model_options
 from icon4py.model.common.constants import RayleighType
 from icon4py.model.common.grid import base as base_grid
-from icon4py.model.testing import data_handling as data, datatest_utils as dt_utils, definitions
+from icon4py.model.testing import (
+    config,
+    data_handling as data,
+    datatest_utils as dt_utils,
+    definitions,
+    locking,
+)
 
 
 if TYPE_CHECKING:
@@ -119,7 +125,22 @@ def _download_ser_data(
     try:
         destination_path = dt_utils.get_datapath_for_experiment(_ranked_data_path, _experiment)
         uri = _experiment.partitioned_data[comm_size]
-        data.download_test_data(destination_path, uri)
+
+        data_file = _ranked_data_path.joinpath(f"{_experiment.name}_mpitask{comm_size}.tar.gz").name
+        _ranked_data_path.mkdir(parents=True, exist_ok=True)
+        if config.ENABLE_TESTDATA_DOWNLOAD:
+            with locking.lock(_ranked_data_path):
+                # Note: if the lock would be created for `destination_path` it would always exist...
+                if not destination_path.exists():
+                    data.download_and_extract(uri, _ranked_data_path, data_file)
+        else:
+            # If test data download is disabled, we check if the directory exists
+            # without locking. We assume the location is managed by the user
+            # and avoid locking shared directories (e.g. on CI).
+            if not destination_path.exists():
+                raise RuntimeError(
+                    f"Serialization data {data_file} does not exist, and downloading is disabled."
+                )
     except KeyError as err:
         raise RuntimeError(
             f"No data for communicator of size {comm_size} exists, use 1, 2 or 4"
diff --git a/model/testing/src/icon4py/model/testing/stencil_tests.py b/model/testing/src/icon4py/model/testing/stencil_tests.py
index ad1bf5e0ac..f83798f029 100644
--- a/model/testing/src/icon4py/model/testing/stencil_tests.py
+++ b/model/testing/src/icon4py/model/testing/stencil_tests.py
@@ -21,7 +21,6 @@
     config as gtx_config,
     constructors,
     metrics as gtx_metrics,
-    named_collections as gtx_named_collections,
     typing as gtx_typing,
 )
 
@@ -35,15 +34,13 @@
 
 def allocate_data(
     allocator: gtx_typing.FieldBufferAllocationUtil | None,
-    input_data: dict[
-        str, Any
-    ],  # `Field`s or collection of `Field`s are re-allocated, the rest is passed through
-) -> dict[str, Any]:
-    def _allocate_field(f: gtx.Field) -> gtx.Field:
-        return constructors.as_field(domain=f.domain, data=f.ndarray, allocator=allocator)
-
+    input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]],
+) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]:
+    _allocate_field = constructors.as_field.partial(allocator=allocator)  # type:ignore[attr-defined] # TODO(havogt): check why it doesn't understand the fluid_partial
     input_data = {
-        k: gtx_named_collections.tree_map_named_collection(_allocate_field)(v)
+        k: tuple(_allocate_field(domain=field.domain, data=field.ndarray) for field in v)
+        if isinstance(v, tuple)
+        else _allocate_field(domain=v.domain, data=v.ndarray)
         if not gtx.is_scalar_type(v) and k != "domain"
         else v
         for k, v in input_data.items()
@@ -210,7 +207,7 @@ def _properly_allocated_input_data(
         self,
         input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]],
         backend_like: model_backends.BackendLike,
-    ) -> dict[str, Any]:
+    ) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]:
         # TODO(havogt): this is a workaround,
         # because in the `input_data` fixture provided by the user
         # it does not allocate for the correct device.

From 731283a76200caadf5ca5c19ac68c26c79949ef5 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 29 Jan 2026 14:00:57 +0100
Subject: [PATCH 05/68] Use CXI hook in CI

---
 ci/distributed.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index d8f8b9e920..00953956a1 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -38,7 +38,7 @@ build_distributed_baseimage_aarch64:
     DOCKERFILE: ci/docker/checkout_mpi.Dockerfile
     DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]'
     PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
-    USE_MPI: NO
+    USE_MPI: YES
     SLURM_MPI_TYPE: pmix
     PMIX_MCA_psec: native
     PMIX_MCA_gds: "^shmem2"

From ea2b3aa7bbfddde4b32bee52ac857b999fec5884 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 29 Jan 2026 14:24:53 +0100
Subject: [PATCH 06/68] Try mpich

---
 ci/distributed.yml            | 7 ++++---
 ci/docker/base_mpi.Dockerfile | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 00953956a1..5f58839466 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -39,9 +39,10 @@ build_distributed_baseimage_aarch64:
     DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]'
     PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
     USE_MPI: YES
-    SLURM_MPI_TYPE: pmix
-    PMIX_MCA_psec: native
-    PMIX_MCA_gds: "^shmem2"
+    SLURM_MPI_TYPE: pmi2
+    # SLURM_MPI_TYPE: pmix
+    # PMIX_MCA_psec: native
+    # PMIX_MCA_gds: "^shmem2"
 
 .build_distributed_cpu:
   extends: [.build_distributed_template]
diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index 92cb700e22..d7c6b379c5 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -13,8 +13,8 @@ RUN apt-get update && \
         curl \
         git \
         libboost-dev \
+        libmpich-dev \
         libnuma-dev \
-        libopenmpi-dev \
         libreadline-dev \
         libssl-dev \
         libtool \

From 8f04d362b80a7f07ade11bf9ebf48f951b5c9f5c Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 29 Jan 2026 14:25:13 +0100
Subject: [PATCH 07/68] Reduce tests

---
 ci/distributed.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 5f58839466..7f75ebc63b 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -80,7 +80,8 @@ build_distributed_cpu:
     - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
   parallel:
     matrix:
-      - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
+      # - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
+      - COMPONENT: [common]
         # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu]
         BACKEND: [dace_cpu, dace_gpu]
   rules:

From 9f96b70edce78ffefdda8cd00b82ee7a886fcd43 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 29 Jan 2026 18:48:31 +0100
Subject: [PATCH 08/68] Try using manually built OpenMPI

---
 ci/distributed.yml            |  9 ++---
 ci/docker/base_mpi.Dockerfile | 75 ++++++++++++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 7f75ebc63b..4d4d518b58 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -38,11 +38,10 @@ build_distributed_baseimage_aarch64:
     DOCKERFILE: ci/docker/checkout_mpi.Dockerfile
     DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]'
     PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
-    USE_MPI: YES
-    SLURM_MPI_TYPE: pmi2
-    # SLURM_MPI_TYPE: pmix
-    # PMIX_MCA_psec: native
-    # PMIX_MCA_gds: "^shmem2"
+    USE_MPI: NO
+    SLURM_MPI_TYPE: pmix
+    PMIX_MCA_psec: native
+    PMIX_MCA_gds: "^shmem2"
 
 .build_distributed_cpu:
   extends: [.build_distributed_template]
diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index d7c6b379c5..bc18fd95fe 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -13,18 +13,91 @@ RUN apt-get update && \
         curl \
         git \
         libboost-dev \
-        libmpich-dev \
+        libconfig-dev \
+        libcurl4-openssl-dev \
+        libfuse-dev \
+        libjson-c-dev \
+        libnl-3-dev \
         libnuma-dev \
         libreadline-dev \
+        libsensors-dev \
         libssl-dev \
         libtool \
+        libuv1-dev \
+        libyaml-dev \
         nvidia-cuda-dev \
         nvidia-cuda-toolkit \
         pkg-config \
+        python3 \
         strace \
         tar \
         wget && \
     rm -rf /var/lib/apt/lists/*
 
+# Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use on Alps.
+ARG gdrcopy_version=2.5.1
+RUN set -eux; \
+    git clone --depth 1 --branch "v${gdrcopy_version}" https://github.com/NVIDIA/gdrcopy.git; \
+    cd gdrcopy; \
+    make lib -j"$(nproc)" lib_install; \
+    cd /; \
+    rm -rf /gdrcopy; \
+    ldconfig
+
+ARG cassini_headers_version=release/shs-13.0.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${cassini_headers_version}" https://github.com/HewlettPackard/shs-cassini-headers.git; \
+    cd shs-cassini-headers; \
+    cp -r include/* /usr/include/; \
+    cp -r share/* /usr/share/; \
+    rm -rf /shs-cassini-headers
+
+ARG cxi_driver_version=release/shs-13.0.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${cxi_driver_version}" https://github.com/HewlettPackard/shs-cxi-driver.git; \
+    cd shs-cxi-driver; \
+    cp -r include/* /usr/include/; \
+    rm -rf /shs-cxi-driver
+
+ARG libcxi_version=release/shs-13.0.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${libcxi_version}" https://github.com/HewlettPackard/shs-libcxi.git; \
+    cd shs-libcxi; \
+    ./autogen.sh; \
+    ./configure \
+      --with-cuda; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf /shs-libcxi; \
+    ldconfig
+
+ARG libfabric_version=v2.4.0
+RUN set -eux; \
+    git clone --depth 1 --branch "${libfabric_version}" https://github.com/ofiwg/libfabric.git; \
+    cd libfabric; \
+    ./autogen.sh; \
+    ./configure \
+      --with-cuda \
+      --enable-cuda-dlopen \
+      --enable-gdrcopy-dlopen \
+      --enable-cxi; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf /libfabric; \
+    ldconfig
+
+ARG openmpi_version=5.0.9
+RUN set -eux; \
+    curl -fsSL "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${openmpi_version}.tar.gz" -o /tmp/ompi.tar.gz; \
+    tar -C /tmp -xzf /tmp/ompi.tar.gz; \
+    cd "/tmp/openmpi-${openmpi_version}"; \
+    ./configure \
+      --with-ofi \
+      --with-cuda=/usr; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf "/tmp/openmpi-${openmpi_version}" /tmp/ompi.tar.gz; \
+    ldconfig
+
 # Install uv: https://docs.astral.sh/uv/guides/integration/docker
 COPY --from=ghcr.io/astral-sh/uv:0.9.24@sha256:816fdce3387ed2142e37d2e56e1b1b97ccc1ea87731ba199dc8a25c04e4997c5 /uv /uvx /bin/

From 9fce9b55efbda6dbe3ea10996bb54b96f8569d81 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 30 Jan 2026 13:01:12 +0100
Subject: [PATCH 09/68] Debugging

---
 ci/distributed.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 4d4d518b58..9d545192cc 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -76,6 +76,8 @@ build_distributed_cpu:
     - source ${UV_PROJECT_ENVIRONMENT}/bin/activate
     - echo "running with $(python --version)"
   script:
+    - printenv
+    - echo USE_MPI=\${USE_MPI}
     - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
   parallel:
     matrix:

From c6a767ed9a1a1ec893fc3943f5f4a2e686f6bee9 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 30 Jan 2026 15:22:32 +0100
Subject: [PATCH 10/68] Remove debug prints

---
 ci/distributed.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 9d545192cc..4d4d518b58 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -76,8 +76,6 @@ build_distributed_cpu:
     - source ${UV_PROJECT_ENVIRONMENT}/bin/activate
     - echo "running with $(python --version)"
   script:
-    - printenv
-    - echo USE_MPI=\${USE_MPI}
     - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
   parallel:
     matrix:

From adb1ee6fda08bb5cda894c49634eb1a63656679a Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 30 Jan 2026 15:28:18 +0100
Subject: [PATCH 11/68] Unrevert test download changes

---
 .../icon4py/model/common/utils/device_utils.py  |  3 +++
 .../src/icon4py/model/testing/data_handling.py  |  5 +++--
 .../icon4py/model/testing/fixtures/datatest.py  |  8 +-------
 .../src/icon4py/model/testing/stencil_tests.py  | 17 ++++++++++-------
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/model/common/src/icon4py/model/common/utils/device_utils.py b/model/common/src/icon4py/model/common/utils/device_utils.py
index cacfc8eb64..360a53902a 100644
--- a/model/common/src/icon4py/model/common/utils/device_utils.py
+++ b/model/common/src/icon4py/model/common/utils/device_utils.py
@@ -37,6 +37,9 @@ def sync(allocator: gtx_typing.FieldBufferAllocationUtil | None = None) -> None:
 
     Note: this is and ad-hoc interface, maybe the function should get the device to sync for.
     """
+    # Type annotation already describes that only these types are allowed, but mypy coverage is not great.
+    # The explicit assert avoids critical mistakes in using this function.
+    assert allocator is None or gtx_allocators.is_field_allocation_tool(allocator)
     if allocator is not None and is_cupy_device(allocator):
         cp.cuda.runtime.deviceSynchronize()
 
diff --git a/model/testing/src/icon4py/model/testing/data_handling.py b/model/testing/src/icon4py/model/testing/data_handling.py
index c490c8981b..95bc8b8369 100644
--- a/model/testing/src/icon4py/model/testing/data_handling.py
+++ b/model/testing/src/icon4py/model/testing/data_handling.py
@@ -9,10 +9,11 @@
 import os
 import pathlib
 import tarfile
-from pathlib import Path
 
+from icon4py.model.testing import config, locking
 
-def download_and_extract(uri: str, dst: Path, data_file: str = "downloaded.tar.gz") -> None:
+
+def download_and_extract(uri: str, dst: pathlib.Path, data_file: str = "downloaded.tar.gz") -> None:
     """
     Download data archive from remote server.
 
diff --git a/model/testing/src/icon4py/model/testing/fixtures/datatest.py b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
index 057235b1eb..0727c962ed 100644
--- a/model/testing/src/icon4py/model/testing/fixtures/datatest.py
+++ b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
@@ -17,13 +17,7 @@
 from icon4py.model.common import model_backends, model_options
 from icon4py.model.common.constants import RayleighType
 from icon4py.model.common.grid import base as base_grid
-from icon4py.model.testing import (
-    config,
-    data_handling as data,
-    datatest_utils as dt_utils,
-    definitions,
-    locking,
-)
+from icon4py.model.testing import data_handling as data, datatest_utils as dt_utils, definitions
 
 
 if TYPE_CHECKING:
diff --git a/model/testing/src/icon4py/model/testing/stencil_tests.py b/model/testing/src/icon4py/model/testing/stencil_tests.py
index f83798f029..ad1bf5e0ac 100644
--- a/model/testing/src/icon4py/model/testing/stencil_tests.py
+++ b/model/testing/src/icon4py/model/testing/stencil_tests.py
@@ -21,6 +21,7 @@
     config as gtx_config,
     constructors,
     metrics as gtx_metrics,
+    named_collections as gtx_named_collections,
     typing as gtx_typing,
 )
 
@@ -34,13 +35,15 @@
 
 def allocate_data(
     allocator: gtx_typing.FieldBufferAllocationUtil | None,
-    input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]],
-) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]:
-    _allocate_field = constructors.as_field.partial(allocator=allocator)  # type:ignore[attr-defined] # TODO(havogt): check why it doesn't understand the fluid_partial
+    input_data: dict[
+        str, Any
+    ],  # `Field`s or collection of `Field`s are re-allocated, the rest is passed through
+) -> dict[str, Any]:
+    def _allocate_field(f: gtx.Field) -> gtx.Field:
+        return constructors.as_field(domain=f.domain, data=f.ndarray, allocator=allocator)
+
     input_data = {
-        k: tuple(_allocate_field(domain=field.domain, data=field.ndarray) for field in v)
-        if isinstance(v, tuple)
-        else _allocate_field(domain=v.domain, data=v.ndarray)
+        k: gtx_named_collections.tree_map_named_collection(_allocate_field)(v)
         if not gtx.is_scalar_type(v) and k != "domain"
         else v
         for k, v in input_data.items()
@@ -207,7 +210,7 @@ def _properly_allocated_input_data(
         self,
         input_data: dict[str, gtx.Field | tuple[gtx.Field, ...]],
         backend_like: model_backends.BackendLike,
-    ) -> dict[str, gtx.Field | tuple[gtx.Field, ...]]:
+    ) -> dict[str, Any]:
         # TODO(havogt): this is a workaround,
         # because in the `input_data` fixture provided by the user
         # it does not allocate for the correct device.

From b0321e77e07460784e93008beaeff3bc4fcffc64 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 30 Jan 2026 15:43:10 +0100
Subject: [PATCH 12/68] Fix NumPy/CuPy issues

Make revert_repeated_index_to_invalid NumPy-only, as it is not usefully vectorized
---
 model/common/src/icon4py/model/common/grid/utils.py  | 10 +++++-----
 model/testing/src/icon4py/model/testing/serialbox.py |  7 ++-----
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/model/common/src/icon4py/model/common/grid/utils.py b/model/common/src/icon4py/model/common/grid/utils.py
index 39b48c9dd5..4af7b0a6ba 100644
--- a/model/common/src/icon4py/model/common/grid/utils.py
+++ b/model/common/src/icon4py/model/common/grid/utils.py
@@ -12,14 +12,14 @@
 from icon4py.model.common.grid import gridfile
 
 
-def revert_repeated_index_to_invalid(offset: np.ndarray, array_ns: ModuleType):
+def revert_repeated_index_to_invalid(offset: np.ndarray):
     num_elements = offset.shape[0]
     for i in range(num_elements):
         # convert repeated indices back into -1
-        for val in array_ns.flip(offset[i, :]):
-            if array_ns.count_nonzero(val == offset[i, :]) > 1:
-                unique_values, counts = array_ns.unique(offset[i, :], return_counts=True)
+        for val in np.flip(offset[i, :]):
+            if np.count_nonzero(val == offset[i, :]) > 1:
+                unique_values, counts = np.unique(offset[i, :], return_counts=True)
                 rep_values = unique_values[counts > 1]
-                rep_indices = array_ns.where(array_ns.isin(offset[i, :], rep_values))[0]
+                rep_indices = np.where(np.isin(offset[i, :], rep_values))[0]
                 offset[i, rep_indices[1:]] = gridfile.GridFile.INVALID_INDEX
     return offset
diff --git a/model/testing/src/icon4py/model/testing/serialbox.py b/model/testing/src/icon4py/model/testing/serialbox.py
index be4edf41dd..05a3fc53fe 100644
--- a/model/testing/src/icon4py/model/testing/serialbox.py
+++ b/model/testing/src/icon4py/model/testing/serialbox.py
@@ -72,7 +72,7 @@ def wrapper(self, *args, **kwargs):
                         # as a workaround for the lack of support for optional fields in gt4py.
                         shp = (1,) * len(dims)
                         return gtx.as_field(
-                            dims, np.zeros(shp, dtype=dtype), allocator=self.backend
+                            dims, self.xp.zeros(shp, dtype=dtype), allocator=self.backend
                         )
                     else:
                         return None
@@ -503,10 +503,7 @@ def construct_icon_grid(
             def potentially_revert_icon_index_transformation(ar):
                 return ar
         else:
-            potentially_revert_icon_index_transformation = functools.partial(
-                grid_utils.revert_repeated_index_to_invalid,
-                array_ns=data_alloc.import_array_ns(backend),
-            )
+            potentially_revert_icon_index_transformation = grid_utils.revert_repeated_index_to_invalid
 
         c2e2c = self.c2e2c()
         e2c2e = potentially_revert_icon_index_transformation(self.e2c2e())

From c62979c718f488c719a59bb997ea56584adbd684 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 30 Jan 2026 16:32:43 +0100
Subject: [PATCH 13/68] Enable shm, lnx, xpmem support in libfabric

---
 ci/docker/base_mpi.Dockerfile | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index bc18fd95fe..6d00d12db9 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -71,6 +71,19 @@ RUN set -eux; \
     rm -rf /shs-libcxi; \
     ldconfig
 
+ARG xpmem_version=0d0bad4e1d07b38d53ecc8f20786bb1328c446da
+RUN set -eux; \
+    git clone https://github.com/hpc/xpmem.git; \
+    cd xpmem; \
+    git checkout "${xpmem_version}"; \
+    ./autogen.sh; \
+    ./configure --disable-kernel-module; \
+    make -j"$(nproc)" install; \
+    cd /; \
+    rm -rf /xpmem; \
+    ldconfig
+
+# NOTE: xpmem is not found correctly without setting the prefix in --enable-xpmem
 ARG libfabric_version=v2.4.0
 RUN set -eux; \
     git clone --depth 1 --branch "${libfabric_version}" https://github.com/ofiwg/libfabric.git; \
@@ -80,7 +93,11 @@ RUN set -eux; \
       --with-cuda \
       --enable-cuda-dlopen \
       --enable-gdrcopy-dlopen \
-      --enable-cxi; \
+      --enable-xpmem=/usr \
+      --enable-tcp \
+      --enable-cxi \
+      --enable-lnx \
+      --enable-shm; \
     make -j"$(nproc)" install; \
     cd /; \
     rm -rf /libfabric; \

From b4071d03f696503dbe4e158c7308372bca3a3362 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 30 Jan 2026 16:45:50 +0100
Subject: [PATCH 14/68] Linting

---
 model/common/src/icon4py/model/common/grid/utils.py  | 1 -
 model/testing/src/icon4py/model/testing/serialbox.py | 4 +++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/model/common/src/icon4py/model/common/grid/utils.py b/model/common/src/icon4py/model/common/grid/utils.py
index 4af7b0a6ba..dbb3d69449 100644
--- a/model/common/src/icon4py/model/common/grid/utils.py
+++ b/model/common/src/icon4py/model/common/grid/utils.py
@@ -5,7 +5,6 @@
 #
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
-from types import ModuleType
 
 import numpy as np
 
diff --git a/model/testing/src/icon4py/model/testing/serialbox.py b/model/testing/src/icon4py/model/testing/serialbox.py
index 05a3fc53fe..3bb52a9ed1 100644
--- a/model/testing/src/icon4py/model/testing/serialbox.py
+++ b/model/testing/src/icon4py/model/testing/serialbox.py
@@ -503,7 +503,9 @@ def construct_icon_grid(
             def potentially_revert_icon_index_transformation(ar):
                 return ar
         else:
-            potentially_revert_icon_index_transformation = grid_utils.revert_repeated_index_to_invalid
+            potentially_revert_icon_index_transformation = (
+                grid_utils.revert_repeated_index_to_invalid
+            )
 
         c2e2c = self.c2e2c()
         e2c2e = potentially_revert_icon_index_transformation(self.e2c2e())

From 6eb3d8d4379b10a9efedeadc84888b54c8e48852 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 30 Jan 2026 19:47:59 +0100
Subject: [PATCH 15/68] Enable GPU support for GHEX

---
 ci/docker/checkout_mpi.Dockerfile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ci/docker/checkout_mpi.Dockerfile b/ci/docker/checkout_mpi.Dockerfile
index 4cbf1d32c0..01e26702b4 100644
--- a/ci/docker/checkout_mpi.Dockerfile
+++ b/ci/docker/checkout_mpi.Dockerfile
@@ -7,5 +7,9 @@ WORKDIR /icon4py
 ARG PYVERSION
 ARG VENV
 ENV UV_PROJECT_ENVIRONMENT=$VENV
-ENV MPI4PY_BUILD_BACKEND="scikit-build-core"
+ENV MPI4PY_BUILD_BACKEND=scikit-build-core
+ENV GHEX_USE_GPU=ON
+ENV GHEX_GPU_TYPE=NVIDIA
+ENV GHEX_GPU_ARCH=90
+ENV GHEX_TRANSPORT_BACKEND=MPI
 RUN uv sync --extra all --extra cuda12 --python=$PYVERSION

From 28b1b1bbdae5a5623f941b238df8106c1165cab6 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Sun, 1 Feb 2026 21:13:08 +0100
Subject: [PATCH 16/68] Set appropriate gcc for cuda

---
 ci/docker/base_mpi.Dockerfile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index 6d00d12db9..eb46a926a9 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -27,6 +27,7 @@ RUN apt-get update && \
         libyaml-dev \
         nvidia-cuda-dev \
         nvidia-cuda-toolkit \
+        nvidia-cuda-toolkit-gcc \
         pkg-config \
         python3 \
         strace \
@@ -34,6 +35,10 @@ RUN apt-get update && \
         wget && \
     rm -rf /var/lib/apt/lists/*
 
+ENV CC=/usr/bin/cuda-gcc
+ENV CXX=/usr/bin/cuda-g++
+ENV CUDAHOSTCXX=/usr/bin/cuda-g++
+
 # Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use on Alps.
 ARG gdrcopy_version=2.5.1
 RUN set -eux; \

From 73a5b5bb1bf26a2d056b66b8db73eaa1a0538441 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Sun, 1 Feb 2026 21:13:20 +0100
Subject: [PATCH 17/68] Explicitly set OpenMPI settings

---
 ci/distributed.yml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 4d4d518b58..3c978a5f69 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -38,10 +38,6 @@ build_distributed_baseimage_aarch64:
     DOCKERFILE: ci/docker/checkout_mpi.Dockerfile
     DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]'
     PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
-    USE_MPI: NO
-    SLURM_MPI_TYPE: pmix
-    PMIX_MCA_psec: native
-    PMIX_MCA_gds: "^shmem2"
 
 .build_distributed_cpu:
   extends: [.build_distributed_template]
@@ -66,6 +62,16 @@ build_distributed_cpu:
     ICON4PY_ENABLE_GRID_DOWNLOAD: false
     ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false
     CSCS_ADDITIONAL_MOUNTS: '["/capstor/store/cscs/userlab/d126/icon4py/ci/testdata_003:$TEST_DATA_PATH"]'
+    # Do not use libfabric from the host system. Libfabric with slingshot
+    # support is built into the container image.
+    USE_MPI: NO
+    # Use libfabric slingshot (cxi) provider and recommended settings from
+    # https://docs.cscs.ch/software/communication/openmpi.
+    SLURM_MPI_TYPE: pmix
+    PMIX_MCA_psec: native
+    FI_PROVIDER: cxi
+    OMPI_MCA_pml: cm
+    OMPI_MCA_mtl: ofi
 
 .test_distributed_aarch64:
   stage: test

From d8e90e4fe01750202ea403fc88d3d44bdb282513 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 3 Feb 2026 11:52:51 +0100
Subject: [PATCH 18/68] Don't dlopen cuda and gdrcopy

---
 ci/docker/base_mpi.Dockerfile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index eb46a926a9..383ffe04c9 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -96,8 +96,6 @@ RUN set -eux; \
     ./autogen.sh; \
     ./configure \
       --with-cuda \
-      --enable-cuda-dlopen \
-      --enable-gdrcopy-dlopen \
       --enable-xpmem=/usr \
       --enable-tcp \
       --enable-cxi \

From 67cfdb51d077ddbeb209e568496f6349eda4eceb Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 3 Feb 2026 13:29:24 +0100
Subject: [PATCH 19/68] Update comments and clean up options

---
 ci/docker/base_mpi.Dockerfile | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index 383ffe04c9..f849c4d626 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -39,7 +39,9 @@ ENV CC=/usr/bin/cuda-gcc
 ENV CXX=/usr/bin/cuda-g++
 ENV CUDAHOSTCXX=/usr/bin/cuda-g++
 
-# Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use on Alps.
+# Install OpenMPI configured with libfabric, libcxi, and gdrcopy support for use
+# on Alps. This is based on examples in
+# https://github.com/eth-cscs/cray-network-stack.
 ARG gdrcopy_version=2.5.1
 RUN set -eux; \
     git clone --depth 1 --branch "v${gdrcopy_version}" https://github.com/NVIDIA/gdrcopy.git; \
@@ -88,7 +90,8 @@ RUN set -eux; \
     rm -rf /xpmem; \
     ldconfig
 
-# NOTE: xpmem is not found correctly without setting the prefix in --enable-xpmem
+# NOTE: xpmem is not found correctly without setting the prefix explicitly in
+# --enable-xpmem
 ARG libfabric_version=v2.4.0
 RUN set -eux; \
     git clone --depth 1 --branch "${libfabric_version}" https://github.com/ofiwg/libfabric.git; \
@@ -98,9 +101,7 @@ RUN set -eux; \
       --with-cuda \
       --enable-xpmem=/usr \
       --enable-tcp \
-      --enable-cxi \
-      --enable-lnx \
-      --enable-shm; \
+      --enable-cxi; \
     make -j"$(nproc)" install; \
     cd /; \
     rm -rf /libfabric; \

From c81af9ebdb020011204538f8b1008d66f9e8d4f4 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 3 Feb 2026 13:29:38 +0100
Subject: [PATCH 20/68] Try ubuntu lts release for distributed ci

---
 ci/docker/base_mpi.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index f849c4d626..c48241855e 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:25.04
+FROM ubuntu:24.04
 
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8

From 790612a0ee7b1e5bd390169e7b15a3c50913d39b Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 3 Feb 2026 13:31:51 +0100
Subject: [PATCH 21/68] Set gpu binding through SLURM_GPUS_PER_TASK

---
 ci/distributed.yml        | 1 +
 scripts/ci-mpi-wrapper.sh | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 3c978a5f69..c0d835e2fe 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -58,6 +58,7 @@ build_distributed_cpu:
     SLURM_JOB_NUM_NODES: 1
     SLURM_CPU_BIND: 'verbose'
     SLURM_NTASKS: 4
+    SLURM_GPUS_PER_TASK: 1
     TEST_DATA_PATH: "/icon4py/testdata"
     ICON4PY_ENABLE_GRID_DOWNLOAD: false
     ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false
diff --git a/scripts/ci-mpi-wrapper.sh b/scripts/ci-mpi-wrapper.sh
index c0aa25d41f..900dd340ae 100755
--- a/scripts/ci-mpi-wrapper.sh
+++ b/scripts/ci-mpi-wrapper.sh
@@ -17,8 +17,6 @@ else
     exit 1
 fi
 
-export CUDA_VISIBLE_DEVICES="${rank}"
-
 log_file="${CI_PROJECT_DIR:+${CI_PROJECT_DIR}/}pytest-log-rank-${rank}.txt"
 
 if [[ "${rank}" -eq 0 ]]; then

From 64482e8fa1eefb5200ac3fbe78406d00e07093c9 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 3 Feb 2026 13:32:32 +0100
Subject: [PATCH 22/68] Enable all tests again

---
 ci/distributed.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index c0d835e2fe..d8e2a1068c 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -86,10 +86,8 @@ build_distributed_cpu:
     - scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
   parallel:
     matrix:
-      # - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
-      - COMPONENT: [common]
-        # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu]
-        BACKEND: [dace_cpu, dace_gpu]
+      - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
+        BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu]
   rules:
     - if: $COMPONENT == 'atmosphere/diffusion'
       variables:

From b3eef3a6c78072d59df6009425a39fa7a6eaf24d Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 3 Feb 2026 13:33:25 +0100
Subject: [PATCH 23/68] Clean up names in distributed.yml

---
 ci/distributed.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index d8e2a1068c..f8600e85b1 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -39,21 +39,21 @@ build_distributed_baseimage_aarch64:
     DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}", "VENV=${UV_PROJECT_ENVIRONMENT}"]'
     PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
 
-.build_distributed_cpu:
+.build_distributed:
   extends: [.build_distributed_template]
   variables:
     UV_PROJECT_ENVIRONMENT: venv_dist
 
-build_distributed_cpu:
+build_distributed:
   stage: image
-  extends: [.container-builder-cscs-gh200, .build_distributed_cpu]
+  extends: [.container-builder-cscs-gh200, .build_distributed]
   needs: [build_distributed_baseimage_aarch64]
 
 .test_template_distributed:
   timeout: 8h
   image: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$UV_PROJECT_ENVIRONMENT-$PYVERSION-mpi
-  extends: [.container-runner-santis-gh200, .build_distributed_cpu]
-  needs: [build_distributed_cpu]
+  extends: [.container-runner-santis-gh200, .build_distributed]
+  needs: [build_distributed]
   variables:
     SLURM_JOB_NUM_NODES: 1
     SLURM_CPU_BIND: 'verbose'

From d6f71d60fb49e6d92fe1a185aaf6a061a654bcc1 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 3 Feb 2026 16:02:30 +0100
Subject: [PATCH 24/68] Update base image to ubuntu 25.10

---
 ci/docker/base_mpi.Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/docker/base_mpi.Dockerfile b/ci/docker/base_mpi.Dockerfile
index c48241855e..a600b4ff1c 100644
--- a/ci/docker/base_mpi.Dockerfile
+++ b/ci/docker/base_mpi.Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:24.04
+FROM ubuntu:25.10
 
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8

From 518bbdee8c8d92e884267b4fd5a157eaaac29b2e Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Wed, 4 Feb 2026 14:19:58 +0100
Subject: [PATCH 25/68] Mark distributed compute_geofac_div test embedded only,
 like single-rank test

---
 .../common/decomposition/mpi_tests/test_mpi_decomposition.py     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
index 5bf956428d..d8f6f2aa88 100644
--- a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
+++ b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
@@ -280,6 +280,7 @@ def test_exchange_on_dummy_data(
 
 @pytest.mark.mpi
 @pytest.mark.datatest
+@pytest.mark.embedded_only
 @pytest.mark.parametrize("processor_props", [False], indirect=True)
 def test_halo_exchange_for_sparse_field(
     interpolation_savepoint: serialbox.InterpolationSavepoint,

From c1eed7f8cc6a57fcc7c96ce55c511fa1f4ed08eb Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Wed, 4 Feb 2026 15:19:00 +0100
Subject: [PATCH 26/68] Use philip's async-mpi branch (fixes gpu buffer stride
 computation)

---
 pyproject.toml | 2 +-
 uv.lock        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e349356eb7..df2c6e3d98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -361,7 +361,7 @@ url = 'https://gridtools.github.io/pypi/'
 
 [tool.uv.sources]
 dace = {index = "gridtools"}
-ghex = {git = "https://github.com/msimberg/GHEX.git", branch = "async-mpi"}
+ghex = {git = "https://github.com/philip-paul-mueller/GHEX.git", branch = "phimuell__async-mpi-2"}
 # gt4py = {git = "https://github.com/GridTools/gt4py", branch = "main"}
 # gt4py = {index = "test.pypi"}
 icon4py-atmosphere-advection = {workspace = true}
diff --git a/uv.lock b/uv.lock
index f5641ba1e4..aca8ec23cc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1362,7 +1362,7 @@ wheels = [
 [[package]]
 name = "ghex"
 version = "0.4.1"
-source = { git = "https://github.com/msimberg/GHEX.git?branch=async-mpi#6d896166994cedbcfc50da1873239a5edb212e3f" }
+source = { git = "https://github.com/philip-paul-mueller/GHEX.git?branch=phimuell__async-mpi-2#80c0650fdae40bdd40e0435e5687267bada4cdd2" }
 dependencies = [
     { name = "mpi4py" },
     { name = "numpy" },
@@ -1887,7 +1887,7 @@ requires-dist = [
     { name = "cupy-cuda12x", marker = "extra == 'cuda12'", specifier = ">=13.0" },
     { name = "dace", specifier = "==43!2026.1.21", index = "https://gridtools.github.io/pypi/" },
     { name = "datashader", marker = "extra == 'io'", specifier = ">=0.16.1" },
-    { name = "ghex", marker = "extra == 'distributed'", git = "https://github.com/msimberg/GHEX.git?branch=async-mpi" },
+    { name = "ghex", marker = "extra == 'distributed'", git = "https://github.com/philip-paul-mueller/GHEX.git?branch=phimuell__async-mpi-2" },
     { name = "gt4py", specifier = "==1.1.3" },
     { name = "gt4py", extras = ["cuda11"], marker = "extra == 'cuda11'" },
     { name = "gt4py", extras = ["cuda12"], marker = "extra == 'cuda12'" },

From d08b60cf14d69dd4c3ec16e546621e64ea0d1ba9 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 4 Feb 2026 16:44:34 +0100
Subject: [PATCH 27/68] Increase time limit for distributed dace tests

---
 ci/distributed.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 3a262d7de0..1828a3f4ea 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -92,9 +92,9 @@ build_distributed:
     - if: $COMPONENT == 'atmosphere/diffusion'
       variables:
         SLURM_TIMELIMIT: '00:05:00'
-    - if: $COMPONENT == 'atmosphere/dycore' && $BACKEND == 'dace_cpu'
+    - if: $COMPONENT == 'atmosphere/dycore' && ($BACKEND == 'dace_cpu' || $BACKEND == 'dace_gpu')
       variables:
-        SLURM_TIMELIMIT: '00:20:00'
+        SLURM_TIMELIMIT: '00:30:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

From 148850c271ccc19c6c8b333b0190c247e19cb2bd Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 4 Feb 2026 16:56:17 +0100
Subject: [PATCH 28/68] Increase time limit for distributed dace_gpu common
 tests

---
 ci/distributed.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 1828a3f4ea..8c22a08611 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -95,6 +95,9 @@ build_distributed:
     - if: $COMPONENT == 'atmosphere/dycore' && ($BACKEND == 'dace_cpu' || $BACKEND == 'dace_gpu')
       variables:
         SLURM_TIMELIMIT: '00:30:00'
+    - if: $COMPONENT == 'common' && $BACKEND == 'dace_gpu'
+      variables:
+        SLURM_TIMELIMIT: '00:45:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

From 0c727f58ff443cf7d049a36bb5d383d0603ec00e Mon Sep 17 00:00:00 2001
From: Jacopo Canton <jacopo.canton@gmail.com>
Date: Thu, 5 Feb 2026 12:52:25 +0100
Subject: [PATCH 29/68] Rename TEST_DATA_PATH to ICON4PY_TEST_DATA_PATH in distributed CI

---
 ci/distributed.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 192838a0f5..4b4038d047 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -59,7 +59,7 @@ build_distributed:
     SLURM_CPU_BIND: 'verbose'
     SLURM_NTASKS: 4
     SLURM_GPUS_PER_TASK: 1
-    TEST_DATA_PATH: "/icon4py/testdata"
+    ICON4PY_TEST_DATA_PATH: "/icon4py/testdata"
     ICON4PY_ENABLE_GRID_DOWNLOAD: false
     ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false
     CSCS_ADDITIONAL_MOUNTS: '["/capstor/store/cscs/userlab/cwci02/icon4py/ci/testdata:$ICON4PY_TEST_DATA_PATH"]'

From ce21e8f73a27ef1af726230bf7a89f6493f84f25 Mon Sep 17 00:00:00 2001
From: Nicoletta Farabullini <41536517+nfarabullini@users.noreply.github.com>
Date: Wed, 4 Mar 2026 16:11:54 +0100
Subject: [PATCH 30/68] modified np strict references with broader array_ns

---
 .../common/interpolation/interpolation_fields.py  | 15 ++++++++++-----
 .../src/icon4py/model/common/math/projection.py   | 14 ++++++++++----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
index c04aa16f20..c1c0c2b954 100644
--- a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
+++ b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
@@ -1181,11 +1181,12 @@ def compute_lsq_weights_c(
     lsq_weights_c_jc: data_alloc.NDArray,
     lsq_dim_stencil: int,
     lsq_wgt_exp: int,
+    array_ns: ModuleType = np,
 ) -> data_alloc.NDArray:
     for js in range(lsq_dim_stencil):
-        z_norm = np.sqrt(np.dot(z_dist_g[js, :], z_dist_g[js, :]))
+        z_norm = array_ns.sqrt(array_ns.dot(z_dist_g[js, :], z_dist_g[js, :]))
         lsq_weights_c_jc[js] = 1.0 / (z_norm**lsq_wgt_exp)
-    return lsq_weights_c_jc / np.max(lsq_weights_c_jc)
+    return lsq_weights_c_jc / array_ns.max(lsq_weights_c_jc)
 
 
 def compute_z_lsq_mat_c(
@@ -1234,9 +1235,13 @@ def compute_lsq_coeffs(
     match base_grid.GeometryType(geometry_type):
         case base_grid.GeometryType.ICOSAHEDRON:
             for js in range(lsq_dim_stencil):
-                z_dist_g[:, js, :] = np.asarray(
+                z_dist_g[:, js, :] = array_ns.asarray(
                     gnomonic_proj(
-                        cell_lon[:], cell_lat[:], cell_lon[c2e2c[:, js]], cell_lat[c2e2c[:, js]]
+                        cell_lon[:],
+                        cell_lat[:],
+                        cell_lon[c2e2c[:, js]],
+                        cell_lat[c2e2c[:, js]],
+                        array_ns,
                     )
                 ).T
 
@@ -1265,7 +1270,7 @@ def compute_lsq_coeffs(
 
     for jc in range(start_idx, min_rlcell_int):
         lsq_weights_c[jc, :] = compute_lsq_weights_c(
-            z_dist_g[jc, :, :], lsq_weights_c[jc, :], lsq_dim_stencil, lsq_wgt_exp
+            z_dist_g[jc, :, :], lsq_weights_c[jc, :], lsq_dim_stencil, lsq_wgt_exp, array_ns
         )
         z_lsq_mat_c[jc, js, :lsq_dim_unk] = compute_z_lsq_mat_c(
             cell_owner_mask,
diff --git a/model/common/src/icon4py/model/common/math/projection.py b/model/common/src/icon4py/model/common/math/projection.py
index fcec8fbc5f..a696cbbadb 100644
--- a/model/common/src/icon4py/model/common/math/projection.py
+++ b/model/common/src/icon4py/model/common/math/projection.py
@@ -5,7 +5,7 @@
 #
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
-
+from types import ModuleType
 
 import numpy as np
 
@@ -17,6 +17,7 @@ def gnomonic_proj(
     lat_c: data_alloc.NDArray,
     lon: data_alloc.NDArray,
     lat: data_alloc.NDArray,
+    array_ns: ModuleType = np,
 ) -> tuple[data_alloc.NDArray, data_alloc.NDArray]:
     """
     Compute gnomonic projection.
@@ -38,11 +39,16 @@ def gnomonic_proj(
     TODO:
         replace this with a suitable library call
     """
-    cosc = np.sin(lat_c) * np.sin(lat) + np.cos(lat_c) * np.cos(lat) * np.cos(lon - lon_c)
+    cosc = array_ns.sin(lat_c) * array_ns.sin(lat) + array_ns.cos(lat_c) * array_ns.cos(
+        lat
+    ) * array_ns.cos(lon - lon_c)
     zk = 1.0 / cosc
 
-    x = zk * np.cos(lat) * np.sin(lon - lon_c)
-    y = zk * (np.cos(lat_c) * np.sin(lat) - np.sin(lat_c) * np.cos(lat) * np.cos(lon - lon_c))
+    x = zk * array_ns.cos(lat) * array_ns.sin(lon - lon_c)
+    y = zk * (
+        array_ns.cos(lat_c) * array_ns.sin(lat)
+        - array_ns.sin(lat_c) * array_ns.cos(lat) * array_ns.cos(lon - lon_c)
+    )
 
     return x, y
 

From 878db70bcc15993c41e62f57aadadc3a0097e1bd Mon Sep 17 00:00:00 2001
From: Nicoletta Farabullini <41536517+nfarabullini@users.noreply.github.com>
Date: Thu, 5 Mar 2026 15:06:45 +0100
Subject: [PATCH 31/68] Use array_ns.linalg.svd instead of scipy dgesdd in compute_lsq_pseudoinv

---
 .../model/common/interpolation/interpolation_fields.py       | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
index c1c0c2b954..e2eb4803d3 100644
--- a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
+++ b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
@@ -13,7 +13,6 @@
 from typing import Final
 
 import numpy as np
-import scipy
 from gt4py import next as gtx
 from gt4py.next import where
 
@@ -1163,11 +1162,12 @@ def compute_lsq_pseudoinv(
     min_rlcell_int: int,
     lsq_dim_unk: int,
     lsq_dim_c: int,
+    array_ns: ModuleType = np,
 ) -> data_alloc.NDArray:
     for jjb in range(lsq_dim_c):
         for jjk in range(lsq_dim_unk):
             for jc in range(start_idx, min_rlcell_int):
-                u, s, v_t, _ = scipy.linalg.lapack.dgesdd(z_lsq_mat_c[jc, :, :])
+                u, s, v_t, _ = array_ns.linalg.svd(z_lsq_mat_c[jc, :, :])
                 if cell_owner_mask[jc]:
                     lsq_pseudoinv[jc, :lsq_dim_unk, jjb] = (
                         lsq_pseudoinv[jc, :lsq_dim_unk, jjb]
@@ -1294,6 +1294,7 @@ def compute_lsq_coeffs(
         min_rlcell_int,
         lsq_dim_unk,
         lsq_dim_c,
+        array_ns
     )
     if exchange != decomposition.single_node_default:
         exchange(lsq_pseudoinv[:, 0, :])

From c449030065bc88edc414e5060c1f7a9916bc717e Mon Sep 17 00:00:00 2001
From: Nicoletta Farabullini <41536517+nfarabullini@users.noreply.github.com>
Date: Thu, 5 Mar 2026 15:08:53 +0100
Subject: [PATCH 32/68] ran pre-commit

---
 .../icon4py/model/common/interpolation/interpolation_fields.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
index e2eb4803d3..b74df27ac7 100644
--- a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
+++ b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
@@ -1294,7 +1294,7 @@ def compute_lsq_coeffs(
         min_rlcell_int,
         lsq_dim_unk,
         lsq_dim_c,
-        array_ns
+        array_ns,
     )
     if exchange != decomposition.single_node_default:
         exchange(lsq_pseudoinv[:, 0, :])

From 9460369d6e2d3facb86e3fd31680202197686b55 Mon Sep 17 00:00:00 2001
From: Nicoletta Farabullini <41536517+nfarabullini@users.noreply.github.com>
Date: Thu, 5 Mar 2026 15:34:08 +0100
Subject: [PATCH 33/68] removed additional but unused return val

---
 .../icon4py/model/common/interpolation/interpolation_fields.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
index b74df27ac7..4d0d6c7f74 100644
--- a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
+++ b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
@@ -1167,7 +1167,7 @@ def compute_lsq_pseudoinv(
     for jjb in range(lsq_dim_c):
         for jjk in range(lsq_dim_unk):
             for jc in range(start_idx, min_rlcell_int):
-                u, s, v_t, _ = array_ns.linalg.svd(z_lsq_mat_c[jc, :, :])
+                u, s, v_t = array_ns.linalg.svd(z_lsq_mat_c[jc, :, :])
                 if cell_owner_mask[jc]:
                     lsq_pseudoinv[jc, :lsq_dim_unk, jjb] = (
                         lsq_pseudoinv[jc, :lsq_dim_unk, jjb]

From c3606ae66b1da39da7c9ee87c7c1106b553c4692 Mon Sep 17 00:00:00 2001
From: Nicoletta Farabullini <41536517+nfarabullini@users.noreply.github.com>
Date: Thu, 5 Mar 2026 16:21:57 +0100
Subject: [PATCH 34/68] Wrap diff_on_edges_torus_numpy result in array_ns.asarray

---
 .../model/common/interpolation/interpolation_fields.py        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
index 4d0d6c7f74..8a8d92318f 100644
--- a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
+++ b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
@@ -1258,14 +1258,14 @@ def compute_lsq_coeffs(
 
                 cc_cv = (cell_center_x[jc], cell_center_y[jc])
                 for js in range(lsq_dim_stencil):
-                    cc_cell[js, :] = diff_on_edges_torus_numpy(
+                    cc_cell[js, :] = array_ns.asarray(diff_on_edges_torus_numpy(
                         cell_center_x[jc],
                         cell_center_y[jc],
                         cell_center_x[ilc_s][js],
                         cell_center_y[ilc_s][js],
                         domain_length,
                         domain_height,
-                    )
+                    ))
                 z_dist_g[jc, :, :] = cc_cell - cc_cv
 
     for jc in range(start_idx, min_rlcell_int):

From 81375cac88716783b1b3dca309e272ace34873f5 Mon Sep 17 00:00:00 2001
From: Nicoletta Farabullini <41536517+nfarabullini@users.noreply.github.com>
Date: Thu, 5 Mar 2026 16:27:51 +0100
Subject: [PATCH 35/68] ran pre-commit

---
 .../interpolation/interpolation_fields.py      | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
index 8a8d92318f..49532d5219 100644
--- a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
+++ b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
@@ -1258,14 +1258,16 @@ def compute_lsq_coeffs(
 
                 cc_cv = (cell_center_x[jc], cell_center_y[jc])
                 for js in range(lsq_dim_stencil):
-                    cc_cell[js, :] = array_ns.asarray(diff_on_edges_torus_numpy(
-                        cell_center_x[jc],
-                        cell_center_y[jc],
-                        cell_center_x[ilc_s][js],
-                        cell_center_y[ilc_s][js],
-                        domain_length,
-                        domain_height,
-                    ))
+                    cc_cell[js, :] = array_ns.asarray(
+                        diff_on_edges_torus_numpy(
+                            cell_center_x[jc],
+                            cell_center_y[jc],
+                            cell_center_x[ilc_s][js],
+                            cell_center_y[ilc_s][js],
+                            domain_length,
+                            domain_height,
+                        )
+                    )
                 z_dist_g[jc, :, :] = cc_cell - cc_cv
 
     for jc in range(start_idx, min_rlcell_int):

From 6362e62be71d482a25e98b8584e1c2abc682d310 Mon Sep 17 00:00:00 2001
From: Nicoletta Farabullini <41536517+nfarabullini@users.noreply.github.com>
Date: Fri, 6 Mar 2026 08:49:08 +0100
Subject: [PATCH 36/68] small fix to tuple

---
 .../icon4py/model/common/interpolation/interpolation_fields.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
index 49532d5219..7b67e7c893 100644
--- a/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
+++ b/model/common/src/icon4py/model/common/interpolation/interpolation_fields.py
@@ -1256,7 +1256,7 @@ def compute_lsq_coeffs(
                 ilc_s = c2e2c[jc, :lsq_dim_stencil]
                 cc_cell = array_ns.zeros((lsq_dim_stencil, 2))
 
-                cc_cv = (cell_center_x[jc], cell_center_y[jc])
+                cc_cv = array_ns.asarray((cell_center_x[jc], cell_center_y[jc]))
                 for js in range(lsq_dim_stencil):
                     cc_cell[js, :] = array_ns.asarray(
                         diff_on_edges_torus_numpy(

From 000efca9298cddf0a14ff61251dd5cf4259b405b Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Thu, 12 Mar 2026 15:30:45 +0100
Subject: [PATCH 37/68] Fix numpy/cupy inconsistency in
 test_parallel_grid_manager.py

---
 .../mpi_tests/test_parallel_grid_manager.py   | 19 +++++++++++++------
 .../tests/common/grid/mpi_tests/utils.py      | 11 ++++++++---
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 70c61ba350..b416c8f7ec 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -137,10 +137,10 @@ def check_local_global_field(
     if check_halos:
         np.testing.assert_allclose(
             global_reference_field[
-                decomposition_info.global_index(dim, decomp_defs.DecompositionInfo.EntryType.HALO)
+                data_alloc.as_numpy(decomposition_info.global_index(dim, decomp_defs.DecompositionInfo.EntryType.HALO))
             ],
             local_field[
-                decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.HALO)
+                data_alloc.as_numpy(decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.HALO))
             ],
             atol=1e-9,
             verbose=True,
@@ -150,7 +150,7 @@ def check_local_global_field(
     # field, by gathering owned entries to the first rank. This ensures that in
     # total we have the full global field distributed on all ranks.
     owned_entries = local_field[
-        decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.OWNED)
+        data_alloc.as_numpy(decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.OWNED))
     ]
     gathered_sizes, gathered_field = gather_field(owned_entries, processor_props)
 
@@ -262,11 +262,13 @@ def test_geometry_fields_compare_single_multi_rank(
     if attrs_name in embedded_broken_fields and test_utils.is_embedded(backend):
         pytest.xfail(f"Field {attrs_name} can't be computed with the embedded backend")
 
+    allocator = model_backends.get_allocator(backend)
+
     # TODO(msimberg): Add fixtures for single/multi-rank
     # grid/geometry/interpolation/metrics factories.
     grid_file = grid_utils._download_grid_file(grid_description)
     _log.info(f"running on {processor_props.comm} with {processor_props.comm_size} ranks")
-    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(grid_file)
+    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(grid_file, allocator=allocator)
     single_rank_geometry = geometry.GridGeometry(
         backend=backend,
         grid=single_rank_grid_manager.grid,
@@ -283,6 +285,7 @@ def test_geometry_fields_compare_single_multi_rank(
         file=grid_file,
         run_properties=processor_props,
         decomposer=decomp.MetisDecomposer(),
+        allocator=allocator,
     )
     _log.info(
         f"rank = {processor_props.rank} : {multi_rank_grid_manager.decomposition_info.get_horizontal_size()!r}"
@@ -359,9 +362,11 @@ def test_interpolation_fields_compare_single_multi_rank(
     if attrs_name in embedded_broken_fields and test_utils.is_embedded(backend):
         pytest.xfail(f"Field {attrs_name} can't be computed with the embedded backend")
 
+    allocator = model_backends.get_allocator(backend)
+
     file = grid_utils.resolve_full_grid_file_name(experiment.grid)
     _log.info(f"running on {processor_props.comm} with {processor_props.comm_size} ranks")
-    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(file)
+    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(file, allocator=allocator)
     single_rank_geometry = geometry.GridGeometry(
         backend=backend,
         grid=single_rank_grid_manager.grid,
@@ -386,6 +391,7 @@ def test_interpolation_fields_compare_single_multi_rank(
         file=file,
         run_properties=processor_props,
         decomposer=decomp.MetisDecomposer(),
+        allocator=allocator,
     )
     _log.info(
         f"rank = {processor_props.rank} : {multi_rank_grid_manager.decomposition_info.get_horizontal_size()!r}"
@@ -535,7 +541,7 @@ def test_metrics_fields_compare_single_multi_rank(
     )
 
     _log.info(f"running on {processor_props.comm} with {processor_props.comm_size} ranks")
-    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(file, experiment.num_levels)
+    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(file, experiment.num_levels, allocator=allocator)
     single_rank_geometry = geometry.GridGeometry(
         backend=backend,
         grid=single_rank_grid_manager.grid,
@@ -584,6 +590,7 @@ def test_metrics_fields_compare_single_multi_rank(
         run_properties=processor_props,
         decomposer=decomp.MetisDecomposer(),
         num_levels=experiment.num_levels,
+        allocator=allocator,
     )
     _log.info(
         f"rank = {processor_props.rank} : {multi_rank_grid_manager.decomposition_info.get_horizontal_size()!r}"
diff --git a/model/common/tests/common/grid/mpi_tests/utils.py b/model/common/tests/common/grid/mpi_tests/utils.py
index 511ec82f77..c94eeaa3af 100644
--- a/model/common/tests/common/grid/mpi_tests/utils.py
+++ b/model/common/tests/common/grid/mpi_tests/utils.py
@@ -8,6 +8,8 @@
 
 import pathlib
 
+import gt4py.next as gtx
+
 from icon4py.model.common.decomposition import decomposer as decomp, definitions as decomp_defs
 from icon4py.model.common.grid import grid_manager as gm, vertical as v_grid
 
@@ -22,14 +24,16 @@ def _grid_manager(file: pathlib.Path, num_levels: int) -> gm.GridManager:
 
 
 def run_grid_manager_for_single_rank(
-    file: pathlib.Path, num_levels: int = NUM_LEVELS
+    file: pathlib.Path,
+    allocator: gtx.typing.Allocator,
+    num_levels: int = NUM_LEVELS,
 ) -> gm.GridManager:
     manager = _grid_manager(file, num_levels)
     manager(
         keep_skip_values=True,
         run_properties=decomp_defs.SingleNodeProcessProperties(),
         decomposer=decomp.SingleNodeDecomposer(),
-        allocator=None,
+        allocator=allocator,
     )
     return manager
 
@@ -38,10 +42,11 @@ def run_grid_manager_for_multi_rank(
     file: pathlib.Path,
     run_properties: decomp_defs.ProcessProperties,
     decomposer: decomp.Decomposer,
+    allocator: gtx.typing.Allocator,
     num_levels: int = NUM_LEVELS,
 ) -> gm.GridManager:
     manager = _grid_manager(file, num_levels)
     manager(
-        keep_skip_values=True, allocator=None, run_properties=run_properties, decomposer=decomposer
+        keep_skip_values=True, allocator=allocator, run_properties=run_properties, decomposer=decomposer
     )
     return manager

From b0c8f5e674f38a09873e4b878f843601113e748a Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Thu, 12 Mar 2026 15:57:24 +0100
Subject: [PATCH 38/68] Loosen rbf tolerance again for gpu

---
 .../tests/common/grid/mpi_tests/test_parallel_grid_manager.py  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index b416c8f7ec..da82327a0d 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -177,7 +177,8 @@ def check_local_global_field(
             f" rank = {processor_props.rank}: SHAPES: global reference field {global_reference_field.shape}, gathered = {gathered_field.shape}"
         )
 
-        np.testing.assert_allclose(sorted_, global_reference_field, atol=1e-9, verbose=True)
+        # TODO(msimberg): The tolerance is high only for RBF fields. Fix it.
+        np.testing.assert_allclose(sorted_, global_reference_field, atol=3e-9, verbose=True)
 
 
 # These fields can't be computed with the embedded backend for one reason or

From f7f7dcdfb39339ad5892bd9959e94ff70f831ba5 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Thu, 12 Mar 2026 16:16:38 +0100
Subject: [PATCH 39/68] Fix allocator argument: pass num_levels by keyword

---
 .../tests/common/grid/mpi_tests/test_parallel_grid_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index da82327a0d..14183a137f 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -542,7 +542,7 @@ def test_metrics_fields_compare_single_multi_rank(
     )
 
     _log.info(f"running on {processor_props.comm} with {processor_props.comm_size} ranks")
-    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(file, experiment.num_levels, allocator=allocator)
+    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(file, allocator=allocator, num_levels=experiment.num_levels)
     single_rank_geometry = geometry.GridGeometry(
         backend=backend,
         grid=single_rank_grid_manager.grid,
@@ -590,8 +590,8 @@ def test_metrics_fields_compare_single_multi_rank(
         file=file,
         run_properties=processor_props,
         decomposer=decomp.MetisDecomposer(),
-        num_levels=experiment.num_levels,
         allocator=allocator,
+        num_levels=experiment.num_levels,
     )
     _log.info(
         f"rank = {processor_props.rank} : {multi_rank_grid_manager.decomposition_info.get_horizontal_size()!r}"

From 17443181290dd7b4e0073a81b3c5c1fca8a8e761 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Mon, 16 Mar 2026 14:50:44 +0100
Subject: [PATCH 40/68] Specify backend for all metrics fields

---
 .../src/icon4py/model/common/metrics/metrics_factory.py       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model/common/src/icon4py/model/common/metrics/metrics_factory.py b/model/common/src/icon4py/model/common/metrics/metrics_factory.py
index 1cc16324a3..f74180348d 100644
--- a/model/common/src/icon4py/model/common/metrics/metrics_factory.py
+++ b/model/common/src/icon4py/model/common/metrics/metrics_factory.py
@@ -870,7 +870,7 @@ def _register_computed_fields(self) -> None:  # noqa: PLR0915 [too-many-statemen
         self.register_provider(compute_maxslp_maxhgtd)
 
         compute_weighted_cell_neighbor_sum = factory.ProgramFieldProvider(
-            func=mf.compute_weighted_cell_neighbor_sum,
+            func=mf.compute_weighted_cell_neighbor_sum.with_backend(self._backend),
             deps={
                 "maxslp": attrs.MAXSLP,
                 "maxhgtd": attrs.MAXHGTD,
@@ -966,7 +966,7 @@ def _register_computed_fields(self) -> None:  # noqa: PLR0915 [too-many-statemen
         self.register_provider(compute_diffusion_intcoef_and_vertoffset)
 
         compute_advection_deepatmo_fields = factory.ProgramFieldProvider(
-            func=compute_advection_metrics.compute_advection_deepatmo_fields,
+            func=compute_advection_metrics.compute_advection_deepatmo_fields.with_backend(self._backend),
             domain={
                 dims.KDim: (
                     vertical_domain(v_grid.Zone.TOP),

From 599505872c3513179fd50d94d098f3d018852053 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Mon, 16 Mar 2026 17:34:30 +0100
Subject: [PATCH 41/68] Add missing allocator to
 test_parallel_grid_refinement.py

---
 .../common/grid/mpi_tests/test_parallel_grid_refinement.py     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
index ce8984e071..80d1039154 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
@@ -11,7 +11,7 @@
 import gt4py.next as gtx
 import pytest
 
-from icon4py.model.common import dimension as dims
+from icon4py.model.common import dimension as dims, model_backends
 from icon4py.model.common.decomposition import (
     decomposer as decomp,
     definitions as decomposition,
@@ -123,6 +123,7 @@ def test_bounds_decomposition(
         file=file,
         run_properties=processor_props,
         decomposer=decomp.MetisDecomposer(),
+        allocator=model_backends.get_allocator(backend),
     )
     _log.info(
         f"rank = {processor_props.rank} : {grid_manager.decomposition_info.get_horizontal_size()!r}"

From 2ff0109c88ac1e2de163518312812d4876e2e0df Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Tue, 17 Mar 2026 11:05:13 +0100
Subject: [PATCH 42/68] Add allocator to test_metrics_mask_prog_halo_c

---
 .../tests/common/grid/mpi_tests/test_parallel_grid_manager.py    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 14183a137f..940053e1dc 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -725,6 +725,7 @@ def test_metrics_mask_prog_halo_c(
         run_properties=processor_props,
         decomposer=decomp.MetisDecomposer(),
         num_levels=experiment.num_levels,
+        allocator=model_backends.get_allocator(backend),
     )
     _log.info(
         f"rank = {processor_props.rank} : {multi_rank_grid_manager.decomposition_info.get_horizontal_size()!r}"

From f802f985a2d05ea5fa2597d4b4f90ac5605a92ab Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Tue, 17 Mar 2026 11:08:13 +0100
Subject: [PATCH 43/68] Pass allocator in skip-values and limited-area tests

---
 .../tests/common/grid/mpi_tests/test_parallel_grid_manager.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 940053e1dc..486bb3889b 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -809,6 +809,7 @@ def test_metrics_mask_prog_halo_c(
 def test_validate_skip_values_in_distributed_connectivities(
     processor_props: decomp_defs.ProcessProperties,
     experiment: test_defs.Experiment,
+    backend: gtx_typing.Backend | None,
 ) -> None:
     if experiment == test_defs.Experiments.MCH_CH_R04B09:
         pytest.xfail("Limited-area grids not yet supported")
@@ -818,6 +819,7 @@ def test_validate_skip_values_in_distributed_connectivities(
         file=file,
         run_properties=processor_props,
         decomposer=decomp.MetisDecomposer(),
+        allocator=model_backends.get_allocator(backend),
     )
     distributed_grid = multi_rank_grid_manager.grid
     for k, c in distributed_grid.connectivities.items():
@@ -841,6 +843,7 @@ def test_validate_skip_values_in_distributed_connectivities(
 def test_limited_area_raises(
     processor_props: decomp_defs.ProcessProperties,
     grid: test_defs.GridDescription,
+    backend: gtx_typing.Backend | None,
 ) -> None:
     with pytest.raises(
         NotImplementedError, match="Limited-area grids are not supported in distributed runs"
@@ -849,4 +852,5 @@ def test_limited_area_raises(
             file=grid_utils.resolve_full_grid_file_name(grid),
             run_properties=processor_props,
             decomposer=decomp.MetisDecomposer(),
+            allocator=model_backends.get_allocator(backend),
         )

From 8d78f0ba4bcdb29ed4b4e0a2424296999d6a1774 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Tue, 17 Mar 2026 11:24:01 +0100
Subject: [PATCH 44/68] Increase timeout in distributed tests

---
 ci/distributed.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index d1f93c5309..aad1bd3795 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -96,9 +96,9 @@ build_distributed:
     - if: $COMPONENT == 'atmosphere/dycore' && ($BACKEND == 'dace_cpu' || $BACKEND == 'dace_gpu')
       variables:
         SLURM_TIMELIMIT: '00:30:00'
-    - if: $COMPONENT == 'common' && $BACKEND == 'dace_gpu'
+    - if: $COMPONENT == 'common' && ($BACKEND == 'dace_gpu' || $BACKEND == 'gtfn_gpu')
       variables:
-        SLURM_TIMELIMIT: '00:45:00'
+        SLURM_TIMELIMIT: '00:60:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

From addef83614771ae7d2256966faffd2d4b8c02908 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 17 Mar 2026 12:02:51 +0100
Subject: [PATCH 45/68] Format files

---
 .../model/common/metrics/metrics_factory.py   |  4 +++-
 .../mpi_tests/test_parallel_grid_manager.py   | 24 +++++++++++++++----
 .../tests/common/grid/mpi_tests/utils.py      |  5 +++-
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/model/common/src/icon4py/model/common/metrics/metrics_factory.py b/model/common/src/icon4py/model/common/metrics/metrics_factory.py
index f74180348d..1f96d4230d 100644
--- a/model/common/src/icon4py/model/common/metrics/metrics_factory.py
+++ b/model/common/src/icon4py/model/common/metrics/metrics_factory.py
@@ -966,7 +966,9 @@ def _register_computed_fields(self) -> None:  # noqa: PLR0915 [too-many-statemen
         self.register_provider(compute_diffusion_intcoef_and_vertoffset)
 
         compute_advection_deepatmo_fields = factory.ProgramFieldProvider(
-            func=compute_advection_metrics.compute_advection_deepatmo_fields.with_backend(self._backend),
+            func=compute_advection_metrics.compute_advection_deepatmo_fields.with_backend(
+                self._backend
+            ),
             domain={
                 dims.KDim: (
                     vertical_domain(v_grid.Zone.TOP),
diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 486bb3889b..3c042df440 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -137,10 +137,18 @@ def check_local_global_field(
     if check_halos:
         np.testing.assert_allclose(
             global_reference_field[
-                data_alloc.as_numpy(decomposition_info.global_index(dim, decomp_defs.DecompositionInfo.EntryType.HALO))
+                data_alloc.as_numpy(
+                    decomposition_info.global_index(
+                        dim, decomp_defs.DecompositionInfo.EntryType.HALO
+                    )
+                )
             ],
             local_field[
-                data_alloc.as_numpy(decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.HALO))
+                data_alloc.as_numpy(
+                    decomposition_info.local_index(
+                        dim, decomp_defs.DecompositionInfo.EntryType.HALO
+                    )
+                )
             ],
             atol=1e-9,
             verbose=True,
@@ -150,7 +158,9 @@ def check_local_global_field(
     # field, by gathering owned entries to the first rank. This ensures that in
     # total we have the full global field distributed on all ranks.
     owned_entries = local_field[
-        data_alloc.as_numpy(decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.OWNED))
+        data_alloc.as_numpy(
+            decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.OWNED)
+        )
     ]
     gathered_sizes, gathered_field = gather_field(owned_entries, processor_props)
 
@@ -269,7 +279,9 @@ def test_geometry_fields_compare_single_multi_rank(
     # grid/geometry/interpolation/metrics factories.
     grid_file = grid_utils._download_grid_file(grid_description)
     _log.info(f"running on {processor_props.comm} with {processor_props.comm_size} ranks")
-    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(grid_file, allocator=allocator)
+    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(
+        grid_file, allocator=allocator
+    )
     single_rank_geometry = geometry.GridGeometry(
         backend=backend,
         grid=single_rank_grid_manager.grid,
@@ -542,7 +554,9 @@ def test_metrics_fields_compare_single_multi_rank(
     )
 
     _log.info(f"running on {processor_props.comm} with {processor_props.comm_size} ranks")
-    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(file, allocator=allocator, num_levels=experiment.num_levels)
+    single_rank_grid_manager = utils.run_grid_manager_for_single_rank(
+        file, allocator=allocator, num_levels=experiment.num_levels
+    )
     single_rank_geometry = geometry.GridGeometry(
         backend=backend,
         grid=single_rank_grid_manager.grid,
diff --git a/model/common/tests/common/grid/mpi_tests/utils.py b/model/common/tests/common/grid/mpi_tests/utils.py
index c94eeaa3af..ca771e1d3c 100644
--- a/model/common/tests/common/grid/mpi_tests/utils.py
+++ b/model/common/tests/common/grid/mpi_tests/utils.py
@@ -47,6 +47,9 @@ def run_grid_manager_for_multi_rank(
 ) -> gm.GridManager:
     manager = _grid_manager(file, num_levels)
     manager(
-        keep_skip_values=True, allocator=allocator, run_properties=run_properties, decomposer=decomposer
+        keep_skip_values=True,
+        allocator=allocator,
+        run_properties=run_properties,
+        decomposer=decomposer,
     )
     return manager

From 5d97f1da73cdc1f02676f151b240487948d8feb7 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Tue, 17 Mar 2026 14:00:00 +0100
Subject: [PATCH 46/68] Increase common GPU SLURM time limit to 01:30:00

---
 ci/distributed.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index aad1bd3795..a023be0c21 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -98,7 +98,8 @@ build_distributed:
         SLURM_TIMELIMIT: '00:30:00'
     - if: $COMPONENT == 'common' && ($BACKEND == 'dace_gpu' || $BACKEND == 'gtfn_gpu')
       variables:
-        SLURM_TIMELIMIT: '00:60:00'
+        # TODO(msimberg): This is very long, can we do better?
+        SLURM_TIMELIMIT: '01:30:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

From 9b153ff9fe7e6f98407bc909de712e7927d85f5c Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Wed, 18 Mar 2026 15:43:10 +0100
Subject: [PATCH 47/68] More consistency for cupy/numpy, use cupy more
 extensively in serialbox.py

---
 .../mpi_tests/test_parallel_grid_manager.py   | 12 ++--
 .../test_parallel_grid_refinement.py          |  6 +-
 .../src/icon4py/model/testing/serialbox.py    | 57 +++++++++----------
 3 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 3c042df440..4c567629c6 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -798,22 +798,22 @@ def test_metrics_mask_prog_halo_c(
     )
 
     attrs_name = metrics_attributes.MASK_PROG_HALO_C
-    field = multi_rank_metrics.get(attrs_name).asnumpy()
-    c_refin_ctrl = multi_rank_metrics.get("c_refin_ctrl").asnumpy()
-    assert not np.any(
+    field = multi_rank_metrics.get(attrs_name).ndarray
+    c_refin_ctrl = multi_rank_metrics.get("c_refin_ctrl").ndarray
+    assert not (
         field[
             multi_rank_grid_manager.decomposition_info.local_index(
                 dims.CellDim, decomp_defs.DecompositionInfo.EntryType.OWNED
             )
         ]
-    ), f"rank={processor_props.rank} - found nonzero in owned entries of {attrs_name}"
+    ).any(), f"rank={processor_props.rank} - found nonzero in owned entries of {attrs_name}"
     halo_indices = multi_rank_grid_manager.decomposition_info.local_index(
         dims.CellDim, decomp_defs.DecompositionInfo.EntryType.HALO
     )
-    assert np.all(
+    assert (
         field[halo_indices]
         == ~((c_refin_ctrl[halo_indices] >= 1) & (c_refin_ctrl[halo_indices] <= 4))
-    ), f"rank={processor_props.rank} - halo for MASK_PROG_HALO_C is incorrect"
+    ).all(), f"rank={processor_props.rank} - halo for MASK_PROG_HALO_C is incorrect"
 
     _log.info(f"rank = {processor_props.rank} - DONE")
 
diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
index 80d1039154..a953113b47 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
@@ -18,6 +18,7 @@
     mpi_decomposition,
 )
 from icon4py.model.common.grid import grid_refinement, horizontal as h_grid
+from icon4py.model.common.utils import data_allocation as data_alloc
 from icon4py.model.testing import definitions, grid_utils, serialbox, test_utils
 from icon4py.model.testing.fixtures.datatest import (
     backend,
@@ -65,6 +66,7 @@ def test_compute_domain_bounds(
     experiment: definitions.Experiment,
     grid_savepoint: serialbox.IconGridSavepoint,
     processor_props: decomposition.ProcessProperties,
+    backend: gtx.typing.Backend | None,
 ) -> None:
     if (
         processor_props.is_single_rank()
@@ -75,11 +77,11 @@ def test_compute_domain_bounds(
             "end index data for single node APE are all 0 - re- serialization should fix that (patch%cells%end_index vs patch%cells%end_idx)"
         )
 
-    ref_grid = grid_savepoint.construct_icon_grid(backend=None, keep_skip_values=True)
+    ref_grid = grid_savepoint.construct_icon_grid(backend=backend, keep_skip_values=True)
     decomposition_info = grid_savepoint.construct_decomposition_info()
     refin_ctrl = {dim: grid_savepoint.refin_ctrl(dim) for dim in utils.main_horizontal_dims()}
     start_indices, end_indices = grid_refinement.compute_domain_bounds(
-        dim, refin_ctrl, decomposition_info
+        dim, refin_ctrl, decomposition_info, array_ns=data_alloc.import_array_ns(backend),
     )
     if (
         experiment == definitions.Experiments.GAUSS3D
diff --git a/model/testing/src/icon4py/model/testing/serialbox.py b/model/testing/src/icon4py/model/testing/serialbox.py
index 84fc38a962..66c1149014 100644
--- a/model/testing/src/icon4py/model/testing/serialbox.py
+++ b/model/testing/src/icon4py/model/testing/serialbox.py
@@ -11,7 +11,6 @@
 
 import gt4py.next as gtx
 import gt4py.next.typing as gtx_typing
-import numpy as np
 import serialbox
 
 import icon4py.model.common.decomposition.definitions as decomposition
@@ -85,7 +84,7 @@ def log_meta_info(self):
         self.log.info(self.savepoint.metainfo)
 
     def _get_field(self, name, *dimensions, dtype=float):
-        buffer = np.squeeze(self.serializer.read(name, self.savepoint).astype(dtype))
+        buffer = self.xp.squeeze(self.serializer.read(name, self.savepoint).astype(dtype))
         buffer = self._reduce_to_dim_size(buffer, dimensions)
 
         self.log.debug(f"{name} {buffer.shape}")
@@ -93,7 +92,7 @@ def _get_field(self, name, *dimensions, dtype=float):
 
     def _get_field_component(self, name: str, level: int, dims: tuple[gtx.Dimension, gtx]):
         buffer = self.serializer.read(name, self.savepoint).astype(float)
-        buffer = np.squeeze(buffer)[:, :, level]
+        buffer = self.xp.squeeze(buffer)[:, :, level]
         buffer = self._reduce_to_dim_size(buffer, dims)
         self.log.debug(f"{name} {buffer.shape}")
         return gtx.as_field(dims, buffer, allocator=self.backend)
@@ -136,7 +135,7 @@ def _read_bool(self, name: str):
         return self._read(name, offset=0, dtype=bool)
 
     def _read(self, name: str, offset=0, dtype=int):
-        return np.squeeze(self.serializer.read(name, self.savepoint) - offset).astype(dtype)
+        return self.xp.asarray(self.xp.squeeze(self.serializer.read(name, self.savepoint) - offset).astype(dtype))
 
 
 class IconGridSavepoint(IconSavepoint):
@@ -365,35 +364,35 @@ def edge_cell_length(self):
 
     def cells_start_index(self):
         start_idx = self._read_int32("c_start_index")
-        return np.where(start_idx == 0, start_idx, start_idx - 1)
+        return self.xp.where(start_idx == 0, start_idx, start_idx - 1)
 
     def cells_end_index(self):
         return self._read_int32("c_end_index")
 
     def vertex_start_index(self):
         start_idx = self._read_int32("v_start_index")
-        return np.where(start_idx == 0, start_idx, start_idx - 1)
+        return self.xp.where(start_idx == 0, start_idx, start_idx - 1)
 
     def vertex_end_index(self):
         return self._read_int32("v_end_index")
 
     def edge_start_index(self):
         start_idx = self._read_int32("e_start_index")
-        return np.where(start_idx == 0, start_idx, start_idx - 1)
+        return self.xp.where(start_idx == 0, start_idx, start_idx - 1)
 
     def edge_end_index(self):
         # don't need to subtract 1, because FORTRAN slices  are inclusive [from:to] so the being
         # one off accounts for being exclusive [from:to)
         return self._read_int32("e_end_index")
 
-    def start_index(self) -> dict[gtx.Dimension, np.ndarray]:
+    def start_index(self) -> dict[gtx.Dimension, data_alloc.NDArray]:
         return {
             dims.CellDim: self.cells_start_index(),
             dims.EdgeDim: self.edge_start_index(),
             dims.VertexDim: self.vertex_start_index(),
         }
 
-    def end_index(self) -> dict[gtx.Dimension, np.ndarray]:
+    def end_index(self) -> dict[gtx.Dimension, data_alloc.NDArray]:
         return {
             dims.CellDim: self.cells_end_index(),
             dims.EdgeDim: self.edge_end_index(),
@@ -426,7 +425,7 @@ def c2e(self):
 
     def _get_connectivity_array(self, name: str, target_dim: gtx.Dimension, reverse: bool = False):
         if reverse:
-            connectivity = np.transpose(self._read_int32(name, offset=1))[
+            connectivity = self.xp.transpose(self._read_int32(name, offset=1))[
                 : self.sizes[target_dim], :
             ]
         else:
@@ -442,7 +441,7 @@ def e2c2e(self):
 
     def c2e2c2e(self):
         if self._c2e2c2e() is None:
-            return np.zeros((self.sizes[dims.CellDim], 9), dtype=gtx.int32)
+            return self.xp.zeros((self.sizes[dims.CellDim], 9), dtype=gtx.int32)
         else:
             return self._c2e2c2e()
 
@@ -501,7 +500,7 @@ def _read_field_for_dim(field_name, read_func, dim: gtx.Dimension):
                 )
 
     def owner_mask(self, dim: gtx.Dimension):
-        return np.squeeze(self._read_field_for_dim("owner_mask", self._read_bool, dim))
+        return self.xp.squeeze(self._read_field_for_dim("owner_mask", self._read_bool, dim))
 
     def global_index(self, dim: gtx.Dimension):
         return self._read_field_for_dim("glb_index", self._read_int32_shift1, dim)
@@ -552,8 +551,8 @@ def potentially_revert_icon_index_transformation(ar):
 
         c2e2c = self.c2e2c()
         e2c2e = potentially_revert_icon_index_transformation(self.e2c2e())
-        c2e2c0 = np.column_stack((range(c2e2c.shape[0]), c2e2c))
-        e2c2e0 = np.column_stack((range(e2c2e.shape[0]), e2c2e))
+        c2e2c0 = self.xp.column_stack((self.xp.asarray(range(c2e2c.shape[0])), c2e2c))
+        e2c2e0 = self.xp.column_stack((self.xp.asarray(range(e2c2e.shape[0])), e2c2e))
 
         constructor = functools.partial(
             h_grid.get_start_end_idx_from_icon_arrays,
@@ -648,7 +647,7 @@ def geofac_grdiv(self):
         return self._get_field("geofac_grdiv", dims.EdgeDim, dims.E2C2EODim)
 
     def geofac_grg(self):
-        grg = np.squeeze(self.serializer.read("geofac_grg", self.savepoint))
+        grg = self.xp.squeeze(self.serializer.read("geofac_grg", self.savepoint))
         num_cells = self.sizes[dims.CellDim]
         return gtx.as_field(
             (dims.CellDim, dims.C2E2CODim), grg[:num_cells, :, 0], allocator=self.backend
@@ -677,7 +676,7 @@ def rbf_vec_coeff_e(self):
     @IconSavepoint.optionally_registered()
     def rbf_vec_coeff_c1(self):
         dimensions = (dims.CellDim, dims.C2E2C2EDim)
-        buffer = np.squeeze(
+        buffer = self.xp.squeeze(
             self.serializer.read("rbf_vec_coeff_c1", self.savepoint).astype(float)
         ).transpose()
         buffer = self._reduce_to_dim_size(buffer, dimensions)
@@ -686,7 +685,7 @@ def rbf_vec_coeff_c1(self):
     @IconSavepoint.optionally_registered()
     def rbf_vec_coeff_c2(self):
         dimensions = (dims.CellDim, dims.C2E2C2EDim)
-        buffer = np.squeeze(
+        buffer = self.xp.squeeze(
             self.serializer.read("rbf_vec_coeff_c2", self.savepoint).astype(float)
         ).transpose()
         buffer = self._reduce_to_dim_size(buffer, dimensions)
@@ -739,15 +738,15 @@ def mask_prog_halo_c(self):
 
     @IconSavepoint.optionally_registered()
     def pg_edgeidx(self):
-        return np.squeeze(self.serializer.read("pg_edgeidx", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("pg_edgeidx", self.savepoint))
 
     @IconSavepoint.optionally_registered()
     def pg_vertidx(self):
-        return np.squeeze(self.serializer.read("pg_vertidx", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("pg_vertidx", self.savepoint))
 
     @IconSavepoint.optionally_registered()
     def pg_exdist(self):
-        return np.squeeze(self.serializer.read("pg_exdist", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("pg_exdist", self.savepoint))
 
     def pg_exdist_dsl(self):
         pg_edgeidx = self.pg_edgeidx()
@@ -891,12 +890,12 @@ def geopot(self):
 
     @IconSavepoint.optionally_registered()
     def zd_cellidx(self):
-        return np.squeeze(self.serializer.read("zd_cellidx", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("zd_cellidx", self.savepoint))
 
     @IconSavepoint.optionally_registered()
     def zd_vertidx(self):
         # this is the k list (with fortran 1-based indexing) for the central point of the C2E2C stencil
-        return np.squeeze(self.serializer.read("zd_vertidx", self.savepoint))[0, :]
+        return self.xp.squeeze(self.serializer.read("zd_vertidx", self.savepoint))[0, :]
 
     @IconSavepoint.optionally_registered(dims.CellDim, dims.C2E2CDim, dims.KDim, dtype=gtx.int32)
     def zd_vertoffset(self):
@@ -904,7 +903,7 @@ def zd_vertoffset(self):
         zd_vertidx = self.zd_vertidx()
         # these are the three k offsets for the C2E2C neighbors
         zd_vertoffset = (
-            np.squeeze(self.serializer.read("zd_vertidx", self.savepoint))[1:, :] - zd_vertidx
+            self.xp.squeeze(self.serializer.read("zd_vertidx", self.savepoint))[1:, :] - zd_vertidx
         )
         cell_c2e2c_k_domain = gtx.domain(
             {
@@ -929,7 +928,7 @@ def zd_vertoffset(self):
     def zd_intcoef(self):
         zd_cellidx = self.zd_cellidx()
         zd_vertidx = self.zd_vertidx()
-        zd_intcoef = np.squeeze(self.serializer.read("zd_intcoef", self.savepoint))
+        zd_intcoef = self.xp.squeeze(self.serializer.read("zd_intcoef", self.savepoint))
         cell_c2e2c_k_domain = gtx.domain(
             {
                 dims.CellDim: self.theta_ref_mc().domain[dims.CellDim].unit_range,
@@ -953,7 +952,7 @@ def zd_intcoef(self):
     def zd_diffcoef(self):
         zd_cellidx = self.zd_cellidx()
         zd_vertidx = self.zd_vertidx()
-        zd_diffcoef = np.squeeze(self.serializer.read("zd_diffcoef", self.savepoint))
+        zd_diffcoef = self.xp.squeeze(self.serializer.read("zd_diffcoef", self.savepoint))
         return data_alloc.list2field(
             domain=self.geopot().domain,
             values=zd_diffcoef,
@@ -1030,16 +1029,16 @@ def exner(self):
         return self._get_field("exner", dims.CellDim, dims.KDim)
 
     def diff_multfac_smag(self):
-        return np.squeeze(self.serializer.read("diff_multfac_smag", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("diff_multfac_smag", self.savepoint))
 
     def enh_smag_fac(self):
-        return np.squeeze(self.serializer.read("enh_smag_fac", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("enh_smag_fac", self.savepoint))
 
     def smag_limit(self):
-        return np.squeeze(self.serializer.read("smag_limit", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("smag_limit", self.savepoint))
 
     def diff_multfac_n2w(self):
-        return np.squeeze(self.serializer.read("diff_multfac_n2w", self.savepoint))
+        return self.xp.squeeze(self.serializer.read("diff_multfac_n2w", self.savepoint))
 
     def nudgezone_diff(self) -> int:
         return self.serializer.read("nudgezone_diff", self.savepoint)[0]

From ea3304fe741a369e4cb58d5efd80ace6f296249c Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Wed, 18 Mar 2026 15:55:30 +0100
Subject: [PATCH 48/68] Very long distributed gpu ci time limit

---
 ci/distributed.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index a023be0c21..cfb8825d63 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -99,7 +99,7 @@ build_distributed:
     - if: $COMPONENT == 'common' && ($BACKEND == 'dace_gpu' || $BACKEND == 'gtfn_gpu')
       variables:
         # TODO(msimberg): This is very long, can we do better?
-        SLURM_TIMELIMIT: '01:30:00'
+        SLURM_TIMELIMIT: '03:00:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

From 89424c98da3e5cc533c768f06750140249cb7594 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Wed, 18 Mar 2026 16:14:08 +0100
Subject: [PATCH 49/68] Move check_local_global_field helper to common file for
 reuse elsewhere

---
 .../mpi_tests/test_parallel_grid_manager.py   | 119 +----------------
 .../icon4py/model/testing/parallel_helpers.py | 120 +++++++++++++++++-
 2 files changed, 121 insertions(+), 118 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 4c2100a931..1fb5869a5b 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -5,9 +5,7 @@
 #
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
-import functools
 import logging
-import operator
 
 import numpy as np
 import pytest
@@ -33,7 +31,7 @@
 from icon4py.model.common.metrics import metrics_attributes, metrics_factory
 from icon4py.model.common.states import utils as state_utils
 from icon4py.model.common.utils import data_allocation as data_alloc
-from icon4py.model.testing import definitions as test_defs, grid_utils, test_utils
+from icon4py.model.testing import definitions as test_defs, grid_utils, parallel_helpers, test_utils
 from icon4py.model.testing.fixtures.datatest import (
     backend,
     experiment,
@@ -85,115 +83,6 @@ def _get_neighbor_tables(grid: base.Grid) -> dict:
     }
 
 
-def gather_field(field: np.ndarray, props: decomp_defs.ProcessProperties) -> tuple:
-    constant_dims = tuple(field.shape[1:])
-    _log.info(f"gather_field on rank={props.rank} - gathering field of local shape {field.shape}")
-    # Because of sparse indexing the field may have a non-contigous layout,
-    # which Gatherv doesn't support. Make sure the field is contiguous.
-    field = np.ascontiguousarray(field)
-    constant_length = functools.reduce(operator.mul, constant_dims, 1)
-    local_sizes = np.array(props.comm.gather(field.size, root=0))
-    if props.rank == 0:
-        recv_buffer = np.empty(np.sum(local_sizes), dtype=field.dtype)
-        _log.info(
-            f"gather_field on rank = {props.rank} - setup receive buffer with size {sum(local_sizes)} on rank 0"
-        )
-    else:
-        recv_buffer = None
-
-    props.comm.Gatherv(sendbuf=field, recvbuf=(recv_buffer, local_sizes), root=0)
-    if props.rank == 0:
-        local_first_dim = tuple(sz // constant_length for sz in local_sizes)
-        _log.info(
-            f" gather_field on rank = 0: computed local dims {local_first_dim} - constant dims {constant_dims}"
-        )
-        gathered_field = recv_buffer.reshape((-1, *constant_dims))  # type: ignore [union-attr]
-    else:
-        gathered_field = None
-        local_first_dim = field.shape
-    return local_first_dim, gathered_field
-
-
-def check_local_global_field(
-    decomposition_info: decomp_defs.DecompositionInfo,
-    processor_props: decomp_defs.ProcessProperties,  # F811 # fixture
-    dim: gtx.Dimension,
-    global_reference_field: np.ndarray,
-    local_field: np.ndarray,
-    check_halos: bool,
-) -> None:
-    if dim == dims.KDim:
-        np.testing.assert_allclose(global_reference_field, local_field)
-        return
-
-    _log.info(
-        f" rank= {processor_props.rank}/{processor_props.comm_size}----exchanging field of main dim {dim}"
-    )
-    assert (
-        local_field.shape[0]
-        == decomposition_info.global_index(dim, decomp_defs.DecompositionInfo.EntryType.ALL).shape[
-            0
-        ]
-    )
-
-    # Compare halo against global reference field
-    if check_halos:
-        np.testing.assert_allclose(
-            global_reference_field[
-                data_alloc.as_numpy(
-                    decomposition_info.global_index(
-                        dim, decomp_defs.DecompositionInfo.EntryType.HALO
-                    )
-                )
-            ],
-            local_field[
-                data_alloc.as_numpy(
-                    decomposition_info.local_index(
-                        dim, decomp_defs.DecompositionInfo.EntryType.HALO
-                    )
-                )
-            ],
-            atol=1e-9,
-            verbose=True,
-        )
-
-    # Compare owned local field, excluding halos, against global reference
-    # field, by gathering owned entries to the first rank. This ensures that in
-    # total we have the full global field distributed on all ranks.
-    owned_entries = local_field[
-        data_alloc.as_numpy(
-            decomposition_info.local_index(dim, decomp_defs.DecompositionInfo.EntryType.OWNED)
-        )
-    ]
-    gathered_sizes, gathered_field = gather_field(owned_entries, processor_props)
-
-    global_index_sizes, gathered_global_indices = gather_field(
-        decomposition_info.global_index(dim, decomp_defs.DecompositionInfo.EntryType.OWNED),
-        processor_props,
-    )
-
-    if processor_props.rank == 0:
-        _log.info(f"rank = {processor_props.rank}: asserting gathered fields: ")
-
-        assert np.all(
-            gathered_sizes == global_index_sizes
-        ), f"gathered field sizes do not match:  {dim} {gathered_sizes} - {global_index_sizes}"
-        _log.info(
-            f"rank = {processor_props.rank}: Checking field size on dim ={dim}: --- gathered sizes {gathered_sizes} = {sum(gathered_sizes)}"
-        )
-        _log.info(
-            f"rank = {processor_props.rank}:                      --- gathered field has size {gathered_sizes}"
-        )
-        sorted_ = np.zeros(global_reference_field.shape, dtype=gtx.float64)  # type: ignore [attr-defined]
-        sorted_[gathered_global_indices] = gathered_field
-        _log.info(
-            f" rank = {processor_props.rank}: SHAPES: global reference field {global_reference_field.shape}, gathered = {gathered_field.shape}"
-        )
-
-        # TODO(msimberg): The tolerance is high only for RBF fields. Fix it.
-        np.testing.assert_allclose(sorted_, global_reference_field, atol=3e-9, verbose=True)
-
-
 # These fields can't be computed with the embedded backend for one reason or
 # another, so we declare them here for xfailing.
 embedded_broken_fields = {
@@ -332,7 +221,7 @@ def test_geometry_fields_compare_single_multi_rank(
     field = multi_rank_geometry.get(attrs_name)
     dim = field_ref.domain.dims[0]
 
-    check_local_global_field(
+    parallel_helpers.check_local_global_field(
         decomposition_info=multi_rank_grid_manager.decomposition_info,
         processor_props=processor_props,
         dim=dim,
@@ -449,7 +338,7 @@ def test_interpolation_fields_compare_single_multi_rank(
     field = multi_rank_interpolation.get(attrs_name)
     dim = field_ref.domain.dims[0]
 
-    check_local_global_field(
+    parallel_helpers.check_local_global_field(
         decomposition_info=multi_rank_grid_manager.decomposition_info,
         processor_props=processor_props,
         dim=dim,
@@ -682,7 +571,7 @@ def test_metrics_fields_compare_single_multi_rank(
         assert isinstance(field, state_utils.ScalarType)
         assert pytest.approx(field) == field_ref
     else:
-        check_local_global_field(
+        parallel_helpers.check_local_global_field(
             decomposition_info=multi_rank_grid_manager.decomposition_info,
             processor_props=processor_props,
             dim=field_ref.domain.dims[0],
diff --git a/model/testing/src/icon4py/model/testing/parallel_helpers.py b/model/testing/src/icon4py/model/testing/parallel_helpers.py
index b0ad1b0465..ea6aa1740e 100644
--- a/model/testing/src/icon4py/model/testing/parallel_helpers.py
+++ b/model/testing/src/icon4py/model/testing/parallel_helpers.py
@@ -5,15 +5,20 @@
 #
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
+import functools
 import logging
+import operator
 
+import numpy as np
 import pytest
+from gt4py import next as gtx
 
 from icon4py.model.common import dimension as dims
 from icon4py.model.common.decomposition import definitions
+from icon4py.model.common.utils import data_allocation as data_alloc
 
 
-log = logging.getLogger(__file__)
+_log = logging.getLogger(__file__)
 
 
 def check_comm_size(
@@ -24,12 +29,121 @@ def check_comm_size(
 
 
 def log_process_properties(props: definitions.ProcessProperties) -> None:
-    log.info(f"rank={props.rank}/{props.comm_size}")
+    _log.info(f"rank={props.rank}/{props.comm_size}")
 
 
 def log_local_field_size(decomposition_info: definitions.DecompositionInfo) -> None:
-    log.info(
+    _log.info(
         f"local grid size: cells={decomposition_info.global_index(dims.CellDim).size}, "
         f"edges={decomposition_info.global_index(dims.EdgeDim).size}, "
         f"vertices={decomposition_info.global_index(dims.VertexDim).size}"
     )
+
+
+def gather_field(field: np.ndarray, props: definitions.ProcessProperties) -> tuple:
+    constant_dims = tuple(field.shape[1:])
+    _log.info(f"gather_field on rank={props.rank} - gathering field of local shape {field.shape}")
+    # Because of sparse indexing the field may have a non-contiguous layout,
+    # which Gatherv doesn't support. Make sure the field is contiguous.
+    field = np.ascontiguousarray(field)
+    constant_length = functools.reduce(operator.mul, constant_dims, 1)
+    local_sizes = np.array(props.comm.gather(field.size, root=0))
+    if props.rank == 0:
+        recv_buffer = np.empty(np.sum(local_sizes), dtype=field.dtype)
+        _log.info(
+            f"gather_field on rank = {props.rank} - setup receive buffer with size {sum(local_sizes)} on rank 0"
+        )
+    else:
+        recv_buffer = None
+
+    props.comm.Gatherv(sendbuf=field, recvbuf=(recv_buffer, local_sizes), root=0)
+    if props.rank == 0:
+        local_first_dim = tuple(sz // constant_length for sz in local_sizes)
+        _log.info(
+            f" gather_field on rank = 0: computed local dims {local_first_dim} - constant dims {constant_dims}"
+        )
+        gathered_field = recv_buffer.reshape((-1, *constant_dims))  # type: ignore [union-attr]
+    else:
+        gathered_field = None
+        local_first_dim = field.shape
+    return local_first_dim, gathered_field
+
+
+def check_local_global_field(
+    decomposition_info: definitions.DecompositionInfo,
+    processor_props: definitions.ProcessProperties,  # F811 # fixture
+    dim: gtx.Dimension,
+    global_reference_field: np.ndarray,
+    local_field: np.ndarray,
+    check_halos: bool,
+) -> None:
+    if dim == dims.KDim:
+        np.testing.assert_allclose(global_reference_field, local_field)
+        return
+
+    _log.info(
+        f" rank= {processor_props.rank}/{processor_props.comm_size}----exchanging field of main dim {dim}"
+    )
+    assert (
+        local_field.shape[0]
+        == decomposition_info.global_index(dim, definitions.DecompositionInfo.EntryType.ALL).shape[
+            0
+        ]
+    )
+
+    # Compare halo against global reference field
+    if check_halos:
+        np.testing.assert_allclose(
+            global_reference_field[
+                data_alloc.as_numpy(
+                    decomposition_info.global_index(
+                        dim, definitions.DecompositionInfo.EntryType.HALO
+                    )
+                )
+            ],
+            local_field[
+                data_alloc.as_numpy(
+                    decomposition_info.local_index(
+                        dim, definitions.DecompositionInfo.EntryType.HALO
+                    )
+                )
+            ],
+            atol=1e-9,
+            verbose=True,
+        )
+
+    # Compare owned local field, excluding halos, against global reference
+    # field, by gathering owned entries to the first rank. This ensures that in
+    # total we have the full global field distributed on all ranks.
+    owned_entries = local_field[
+        data_alloc.as_numpy(
+            decomposition_info.local_index(dim, definitions.DecompositionInfo.EntryType.OWNED)
+        )
+    ]
+    gathered_sizes, gathered_field = gather_field(owned_entries, processor_props)
+
+    global_index_sizes, gathered_global_indices = gather_field(
+        data_alloc.as_numpy(decomposition_info.global_index(dim, definitions.DecompositionInfo.EntryType.OWNED)),
+        processor_props,
+    )
+
+    if processor_props.rank == 0:
+        _log.info(f"rank = {processor_props.rank}: asserting gathered fields: ")
+
+        assert np.all(
+            gathered_sizes == global_index_sizes
+        ), f"gathered field sizes do not match:  {dim} {gathered_sizes} - {global_index_sizes}"
+        _log.info(
+            f"rank = {processor_props.rank}: Checking field size on dim ={dim}: --- gathered sizes {gathered_sizes} = {sum(gathered_sizes)}"
+        )
+        _log.info(
+            f"rank = {processor_props.rank}:                      --- gathered field has size {gathered_sizes}"
+        )
+        sorted_ = np.zeros(global_reference_field.shape, dtype=gtx.float64)  # type: ignore [attr-defined]
+        sorted_[gathered_global_indices] = gathered_field
+        _log.info(
+            f" rank = {processor_props.rank}: SHAPES: global reference field {global_reference_field.shape}, gathered = {gathered_field.shape}"
+        )
+
+        # TODO(msimberg): The tolerance is high only for RBF fields. Fix it.
+        np.testing.assert_allclose(sorted_, global_reference_field, atol=3e-9, verbose=True)

From 79dcdb38090fb9c86a8b5e055adbc9bd8ac19f01 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Wed, 18 Mar 2026 16:19:50 +0100
Subject: [PATCH 50/68] Add customizable tolerance to check_local_global_field

---
 .../common/grid/mpi_tests/test_parallel_grid_manager.py     | 3 +++
 model/testing/src/icon4py/model/testing/parallel_helpers.py | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 1fb5869a5b..b64240a9a7 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -228,6 +228,7 @@ def test_geometry_fields_compare_single_multi_rank(
         global_reference_field=field_ref.asnumpy(),
         local_field=field.asnumpy(),
         check_halos=True,
+        atol=0.0,
     )
 
     _log.info(f"rank = {processor_props.rank} - DONE")
@@ -345,6 +346,7 @@ def test_interpolation_fields_compare_single_multi_rank(
         global_reference_field=field_ref.asnumpy(),
         local_field=field.asnumpy(),
         check_halos=True,
+        atol=3e-9 if attrs_name.startswith("rbf") else 0.0,
     )
 
     _log.info(f"rank = {processor_props.rank} - DONE")
@@ -578,6 +580,7 @@ def test_metrics_fields_compare_single_multi_rank(
             global_reference_field=field_ref.asnumpy(),
             local_field=field.asnumpy(),
             check_halos=(attrs_name != metrics_attributes.WGTFAC_E),
+            atol=0.0,
         )
 
     _log.info(f"rank = {processor_props.rank} - DONE")
diff --git a/model/testing/src/icon4py/model/testing/parallel_helpers.py b/model/testing/src/icon4py/model/testing/parallel_helpers.py
index ea6aa1740e..8d4a24782f 100644
--- a/model/testing/src/icon4py/model/testing/parallel_helpers.py
+++ b/model/testing/src/icon4py/model/testing/parallel_helpers.py
@@ -76,6 +76,7 @@ def check_local_global_field(
     global_reference_field: np.ndarray,
     local_field: np.ndarray,
     check_halos: bool,
+    atol: float,
 ) -> None:
     if dim == dims.KDim:
         np.testing.assert_allclose(global_reference_field, local_field)
@@ -108,7 +109,7 @@ def check_local_global_field(
                     )
                 )
             ],
-            atol=1e-9,
+            atol=atol,
             verbose=True,
         )
 
@@ -145,5 +146,4 @@ def check_local_global_field(
             f" rank = {processor_props.rank}: SHAPES: global reference field {global_reference_field.shape}, gathered = {gathered_field.shape}"
         )
 
-        # TODO(msimberg): The tolerance is high only for RBF fields. Fix it.
-        np.testing.assert_allclose(sorted_, global_reference_field, atol=3e-9, verbose=True)
+        np.testing.assert_allclose(sorted_, global_reference_field, atol=atol, verbose=True)

From d555a26a9bcdfaf4c709a6a43b3c1ad253574d01 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Thu, 19 Mar 2026 11:30:36 +0100
Subject: [PATCH 51/68] numpy/cupy fixes

---
 .../mpi_tests/test_mpi_decomposition.py       | 18 +++----
 .../decomposition/unit_tests/test_halo.py     |  4 +-
 .../grid/unit_tests/test_grid_manager.py      | 48 +++++++++----------
 .../unit_tests/test_rbf_interpolation.py      |  6 +--
 .../math/unit_tests/test_smagorinsky.py       |  5 +-
 5 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
index 094fecd132..c2cb0d1ac5 100644
--- a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
+++ b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
@@ -290,9 +290,10 @@ def test_exchange_on_dummy_data(
     decomposition_info: definitions.DecompositionInfo,
     grid_savepoint: serialbox.IconGridSavepoint,
     dimension: gtx.Dimension,
+    backend: gtx.typing.Backend | None,
 ) -> None:
     exchange = definitions.create_exchange(processor_props, decomposition_info)
-    grid = grid_savepoint.construct_icon_grid()
+    grid = grid_savepoint.construct_icon_grid(backend=backend)
 
     number = processor_props.rank + 10
     input_field = data_alloc.constant_field(
@@ -300,15 +301,16 @@ def test_exchange_on_dummy_data(
         number,
         dimension,
         dims.KDim,
+        allocator=backend,
     )
 
-    halo_points = decomposition_info.local_index(
+    halo_points = data_alloc.as_numpy(decomposition_info.local_index(
         dimension, definitions.DecompositionInfo.EntryType.HALO
-    )
-    local_points = decomposition_info.local_index(
+    ))
+    local_points = data_alloc.as_numpy(decomposition_info.local_index(
         dimension, definitions.DecompositionInfo.EntryType.OWNED
-    )
-    assert np.all(input_field.asnumpy() == number)
+    ))
+    assert (input_field.ndarray == number).all()
     exchange.exchange(dimension, input_field, stream=definitions.BLOCK)
     result = input_field.asnumpy()
     _log.info(f"rank={processor_props.rank} - num of halo points ={halo_points.shape}")
@@ -319,8 +321,8 @@ def test_exchange_on_dummy_data(
     changed_points = np.argwhere(result[:, 2] != number)
     _log.info(f"rank={processor_props.rank} - num changed points {changed_points.shape} ")
 
-    assert np.all(result[local_points, :] == number)
-    assert np.all(result[halo_points, :] != number)
+    assert (result[local_points, :] == number).all()
+    assert (result[halo_points, :] != number).all()
 
 
 @pytest.mark.mpi
diff --git a/model/common/tests/common/decomposition/unit_tests/test_halo.py b/model/common/tests/common/decomposition/unit_tests/test_halo.py
index 420f1c20ba..d37a234edc 100644
--- a/model/common/tests/common/decomposition/unit_tests/test_halo.py
+++ b/model/common/tests/common/decomposition/unit_tests/test_halo.py
@@ -13,6 +13,7 @@
 from icon4py.model.common import dimension as dims, exceptions, model_backends
 from icon4py.model.common.decomposition import decomposer as decomp, definitions, halo
 from icon4py.model.common.grid import base as base_grid, simple
+from icon4py.model.common.utils import data_allocation as data_alloc
 from icon4py.model.testing import test_utils
 
 from ...fixtures import backend_like, processor_props
@@ -32,7 +33,8 @@ def test_halo_constructor_owned_cells(rank, simple_neighbor_tables, backend_like
         run_properties=processor_props,
         allocator=allocator,
     )
-    my_owned_cells = halo_generator.owned_cells(utils.SIMPLE_DISTRIBUTION)
+    xp = data_alloc.import_array_ns(allocator)
+    my_owned_cells = data_alloc.as_numpy(halo_generator.owned_cells(xp.asarray(utils.SIMPLE_DISTRIBUTION)))
 
     print(f"rank {processor_props.rank} owns {my_owned_cells} ")
     assert my_owned_cells.size == len(utils._CELL_OWN[processor_props.rank])
diff --git a/model/common/tests/common/grid/unit_tests/test_grid_manager.py b/model/common/tests/common/grid/unit_tests/test_grid_manager.py
index c809414e77..f3e2c4e7e5 100644
--- a/model/common/tests/common/grid/unit_tests/test_grid_manager.py
+++ b/model/common/tests/common/grid/unit_tests/test_grid_manager.py
@@ -74,7 +74,7 @@ def test_grid_manager_eval_v2e(
     backend: gtx_typing.Backend,
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
-    seralized_v2e = grid_savepoint.v2e()
+    seralized_v2e = data_alloc.as_numpy(grid_savepoint.v2e())
     # there are vertices at the boundary of a local domain or at a pentagon point that have less than
     # 6 neighbors hence there are "Missing values" in the grid file
     # they get substituted by the "last valid index" in preprocessing step in icon.
@@ -120,7 +120,7 @@ def test_grid_manager_eval_v2c(
     backend: gtx_typing.Backend,
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
-    serialized_v2c = grid_savepoint.v2c()
+    serialized_v2c = data_alloc.as_numpy(grid_savepoint.v2c())
     v2c_table = grid.get_connectivity("V2C").asnumpy()
     # there are vertices that have less than 6 neighboring cells: either pentagon points or
     # vertices at the boundary of the domain for a limited area mode
@@ -176,7 +176,7 @@ def test_grid_manager_eval_e2v(
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
 
-    serialized_e2v = grid_savepoint.e2v()
+    serialized_e2v = data_alloc.as_numpy(grid_savepoint.e2v())
     e2v_table = grid.get_connectivity("E2V").asnumpy()
     # all vertices in the system have to neighboring edges, there no edges that point nowhere
     # hence this connectivity has no "missing values" in the grid file
@@ -199,7 +199,7 @@ def test_grid_manager_eval_e2c(
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
 
-    serialized_e2c = grid_savepoint.e2c()
+    serialized_e2c = data_alloc.as_numpy(grid_savepoint.e2c())
     e2c_table = grid.get_connectivity("E2C").asnumpy()
     assert has_invalid_index(serialized_e2c) == grid.limited_area
     assert has_invalid_index(e2c_table) == grid.limited_area
@@ -216,7 +216,7 @@ def test_grid_manager_eval_c2e(
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
 
-    serialized_c2e = grid_savepoint.c2e()
+    serialized_c2e = data_alloc.as_numpy(grid_savepoint.c2e())
     c2e_table = grid.get_connectivity("C2E").asnumpy()
     # no cells with less than 3 neighboring edges exist, otherwise the cell is not there in the
     # first place
@@ -237,7 +237,7 @@ def test_grid_manager_eval_c2e2c(
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
     assert np.allclose(
         grid.get_connectivity("C2E2C").asnumpy(),
-        grid_savepoint.c2e2c(),
+        data_alloc.as_numpy(grid_savepoint.c2e2c()),
     )
 
 
@@ -249,7 +249,7 @@ def test_grid_manager_eval_c2e2cO(
     backend: gtx_typing.Backend,
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
-    serialized_grid = grid_savepoint.construct_icon_grid()
+    serialized_grid = grid_savepoint.construct_icon_grid(backend=backend)
     assert np.allclose(
         grid.get_connectivity("C2E2CO").asnumpy(),
         serialized_grid.get_connectivity("C2E2CO").asnumpy(),
@@ -265,7 +265,7 @@ def test_grid_manager_eval_e2c2e(
     backend: gtx_typing.Backend,
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
-    serialized_grid = grid_savepoint.construct_icon_grid()
+    serialized_grid = grid_savepoint.construct_icon_grid(backend=backend)
     serialized_e2c2e = serialized_grid.get_connectivity("E2C2E").asnumpy()
     serialized_e2c2eO = serialized_grid.get_connectivity("E2C2EO").asnumpy()
     assert has_invalid_index(serialized_e2c2e) == grid.limited_area
@@ -290,7 +290,7 @@ def test_grid_manager_eval_e2c2v(
     backend: gtx_typing.Backend,
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
-    serialized_ref = grid_savepoint.e2c2v()
+    serialized_ref = data_alloc.as_numpy(grid_savepoint.e2c2v())
     # the "far" (adjacent to edge normal ) is not always there, because ICON only calculates those starting from
     #   (lateral_boundary(dims.EdgeDim) + 1) to end(dims.EdgeDim)  (see mo_intp_coeffs.f90) and only for owned cells
     table = grid.get_connectivity("E2C2V").asnumpy()
@@ -311,7 +311,7 @@ def test_grid_manager_eval_c2v(
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
     c2v = grid.get_connectivity("C2V").asnumpy()
-    assert np.allclose(c2v, grid_savepoint.c2v())
+    assert np.allclose(c2v, data_alloc.as_numpy(grid_savepoint.c2v()))
 
 
 @pytest.mark.parametrize(
@@ -397,7 +397,7 @@ def test_grid_manager_eval_c2e2c2e(
     backend: gtx_typing.Backend,
 ) -> None:
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
-    serialized_grid = grid_savepoint.construct_icon_grid()
+    serialized_grid = grid_savepoint.construct_icon_grid(backend=backend)
     assert np.allclose(
         grid.get_connectivity("C2E2C2E").asnumpy(),
         serialized_grid.get_connectivity("C2E2C2E").asnumpy(),
@@ -415,7 +415,7 @@ def test_grid_manager_start_end_index_compare_with_serialized_data(
     dim: gtx.Dimension,
     backend: gtx_typing.Backend,
 ) -> None:
-    serialized_grid = grid_savepoint.construct_icon_grid()
+    serialized_grid = grid_savepoint.construct_icon_grid(backend=backend)
     grid = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend).grid
 
     for domain in h_grid.get_domains_for_dim(dim):
@@ -469,11 +469,11 @@ def test_tangent_orientation(
     experiment: definitions.Experiment,
     backend: gtx_typing.Backend,
 ) -> None:
-    expected = grid_savepoint.tangent_orientation()
+    expected = data_alloc.as_numpy(grid_savepoint.tangent_orientation())
     manager = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend)
     assert test_utils.dallclose(
         manager.geometry_fields[gridfile.GeometryName.TANGENT_ORIENTATION].asnumpy(),
-        expected.asnumpy(),
+        expected,
     )
 
 
@@ -483,11 +483,11 @@ def test_edge_orientation_on_vertex(
     experiment: definitions.Experiment,
     backend: gtx_typing.Backend,
 ) -> None:
-    expected = grid_savepoint.vertex_edge_orientation()
+    expected = data_alloc.as_numpy(grid_savepoint.vertex_edge_orientation())
     manager = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend)
     assert test_utils.dallclose(
         manager.geometry_fields[gridfile.GeometryName.EDGE_ORIENTATION_ON_VERTEX].asnumpy(),
-        expected.asnumpy(),
+        expected,
     )
 
 
@@ -526,11 +526,11 @@ def test_cell_normal_orientation(
     experiment: definitions.Experiment,
     backend: gtx_typing.Backend,
 ) -> None:
-    expected = grid_savepoint.edge_orientation()
+    expected = data_alloc.as_numpy(grid_savepoint.edge_orientation())
     manager = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend)
     assert test_utils.dallclose(
         manager.geometry_fields[gridfile.GeometryName.CELL_NORMAL_ORIENTATION].asnumpy(),
-        expected.asnumpy(),
+        expected,
     )
 
 
@@ -540,12 +540,12 @@ def test_edge_vertex_distance(
     experiment: definitions.Experiment,
     backend: gtx_typing.Backend,
 ) -> None:
-    expected = grid_savepoint.edge_vert_length()
+    expected = data_alloc.as_numpy(grid_savepoint.edge_vert_length())
     manager = utils.run_grid_manager(experiment.grid, keep_skip_values=True, backend=backend)
 
     assert test_utils.dallclose(
         manager.geometry_fields[gridfile.GeometryName.EDGE_VERTEX_DISTANCE].asnumpy(),
-        expected.asnumpy(),
+        expected,
         equal_nan=True,
     )
 
@@ -574,10 +574,10 @@ def test_decomposition_info_single_rank(
     grid_file = experiment.grid
     gm = utils.run_grid_manager(grid_file, keep_skip_values=True, backend=backend)
     result = gm.decomposition_info
-    assert np.all(data_alloc.as_numpy(result.local_index(dim)) == expected.local_index(dim))
-    assert np.all(data_alloc.as_numpy(result.global_index(dim)) == expected.global_index(dim))
-    assert np.all(data_alloc.as_numpy(result.owner_mask(dim)) == expected.owner_mask(dim))
-    assert np.all(data_alloc.as_numpy(result.halo_levels(dim)) == expected.halo_levels(dim))
+    assert (result.local_index(dim) == expected.local_index(dim)).all()
+    assert (result.global_index(dim) == expected.global_index(dim)).all()
+    assert (result.owner_mask(dim) == expected.owner_mask(dim)).all()
+    assert (result.halo_levels(dim) == expected.halo_levels(dim)).all()
 
 
 @pytest.mark.parametrize("rank", (0, 1, 2, 3), ids=lambda rank: f"rank{rank}")
diff --git a/model/common/tests/common/interpolation/unit_tests/test_rbf_interpolation.py b/model/common/tests/common/interpolation/unit_tests/test_rbf_interpolation.py
index c6bfb0000b..5f57daaf85 100644
--- a/model/common/tests/common/interpolation/unit_tests/test_rbf_interpolation.py
+++ b/model/common/tests/common/interpolation/unit_tests/test_rbf_interpolation.py
@@ -79,7 +79,7 @@ def test_construct_rbf_matrix_offsets_tables_for_cells(
     )
     assert np.max(offset_table) == grid.num_edges - 1
 
-    offset_table_savepoint = grid_savepoint.c2e2c2e()
+    offset_table_savepoint = data_alloc.as_numpy(grid_savepoint.c2e2c2e())
     assert offset_table.shape == offset_table_savepoint.shape
 
     # Savepoint neighbors before start index may not be populated correctly,
@@ -111,7 +111,7 @@ def test_construct_rbf_matrix_offsets_tables_for_edges(
     )
     assert np.max(offset_table) == grid.num_edges - 1
 
-    offset_table_savepoint = grid_savepoint.e2c2e()
+    offset_table_savepoint = data_alloc.as_numpy(grid_savepoint.e2c2e())
     assert offset_table.shape == offset_table_savepoint.shape
 
     start_index = grid.start_index(
@@ -141,7 +141,7 @@ def test_construct_rbf_matrix_offsets_tables_for_vertices(
     )
     assert np.max(offset_table) == grid.num_edges - 1
 
-    offset_table_savepoint = grid_savepoint.v2e()
+    offset_table_savepoint = data_alloc.as_numpy(grid_savepoint.v2e())
     assert offset_table.shape == offset_table_savepoint.shape
 
     start_index = grid.start_index(
diff --git a/model/common/tests/common/math/unit_tests/test_smagorinsky.py b/model/common/tests/common/math/unit_tests/test_smagorinsky.py
index e43a97ab71..c8d673436a 100644
--- a/model/common/tests/common/math/unit_tests/test_smagorinsky.py
+++ b/model/common/tests/common/math/unit_tests/test_smagorinsky.py
@@ -24,9 +24,8 @@ def test_init_enh_smag_fac(backend_like: model_backends.BackendLike, grid: base_
     a_vec = data_alloc.random_field(
         grid, dims.KDim, low=1.0, high=10.0, extend={dims.KDim: 1}, allocator=backend
     )
-    xp = data_alloc.import_array_ns(backend)
-    fac = xp.asarray([0.67, 0.5, 1.3, 0.8])
-    z = xp.asarray([0.1, 0.2, 0.3, 0.4])
+    fac = np.asarray([0.67, 0.5, 1.3, 0.8])
+    z = np.asarray([0.1, 0.2, 0.3, 0.4])
 
     enhanced_smag_fac_np = enhanced_smagorinski_factor_numpy(fac, z, a_vec.asnumpy())
     en_smag_fac_for_zero_nshift.with_backend(backend)(

From 57d1694daa3e435f220bca08dffe386424674ce5 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Thu, 19 Mar 2026 14:21:46 +0100
Subject: [PATCH 52/68] Slightly loosen test_parallel_grid_manager.py
 tolerances again

---
 .../tests/common/grid/mpi_tests/test_parallel_grid_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index b64240a9a7..3399019422 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -228,7 +228,7 @@ def test_geometry_fields_compare_single_multi_rank(
         global_reference_field=field_ref.asnumpy(),
         local_field=field.asnumpy(),
         check_halos=True,
-        atol=0.0,
+        atol=1e-15,
     )
 
     _log.info(f"rank = {processor_props.rank} - DONE")
@@ -346,7 +346,7 @@ def test_interpolation_fields_compare_single_multi_rank(
         global_reference_field=field_ref.asnumpy(),
         local_field=field.asnumpy(),
         check_halos=True,
-        atol=3e-9 if attrs_name.startswith("rbf") else 0.0,
+        atol=3e-9 if attrs_name.startswith("rbf") else 1e-10 if attrs_name.startswith("pos_on_tplane") else 1e-15,
     )
 
     _log.info(f"rank = {processor_props.rank} - DONE")

From 61a3f45614ad26f3c3d2f2bd34eac4e37fbaf681 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <simberg@cscs.ch>
Date: Thu, 19 Mar 2026 14:33:00 +0100
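Subject: [PATCH 53/68] Print failures immediately in CI

Install pytest-instafail and pass --instafail to pytest so failures are
reported as soon as they happen. The ci/scripts/ci-mpi-wrapper.sh
invocation is commented out for now (see the TODO in ci/distributed.yml),
so pytest is invoked directly.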
---
 ci/distributed.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index cfb8825d63..a0797d5f90 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -84,7 +84,10 @@ build_distributed:
     - source ${UV_PROJECT_ENVIRONMENT}/bin/activate
     - echo "running with $(python --version)"
   script:
-    - ci/scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
+    # TODO
+    # - ci/scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
+    - uv pip install pytest-instafail
+    - pytest --instafail -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
   parallel:
     matrix:
       - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]

From fb209bb1ed42eacad9027384a4a70f60160e0232 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 19 Mar 2026 14:40:04 +0100
Subject: [PATCH 54/68] Fix formatting and linter warnings

---
 .../mpi_tests/test_mpi_decomposition.py              | 12 ++++++------
 .../common/decomposition/unit_tests/test_halo.py     |  4 +++-
 .../grid/mpi_tests/test_parallel_grid_manager.py     |  8 ++++++--
 .../grid/mpi_tests/test_parallel_grid_refinement.py  |  5 ++++-
 .../tests/common/math/unit_tests/test_smagorinsky.py |  4 ++--
 .../src/icon4py/model/testing/parallel_helpers.py    |  4 +++-
 model/testing/src/icon4py/model/testing/serialbox.py |  4 +++-
 7 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
index c2cb0d1ac5..3f66caca96 100644
--- a/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
+++ b/model/common/tests/common/decomposition/mpi_tests/test_mpi_decomposition.py
@@ -304,12 +304,12 @@ def test_exchange_on_dummy_data(
         allocator=backend,
     )
 
-    halo_points = data_alloc.as_numpy(decomposition_info.local_index(
-        dimension, definitions.DecompositionInfo.EntryType.HALO
-    ))
-    local_points = data_alloc.as_numpy(decomposition_info.local_index(
-        dimension, definitions.DecompositionInfo.EntryType.OWNED
-    ))
+    halo_points = data_alloc.as_numpy(
+        decomposition_info.local_index(dimension, definitions.DecompositionInfo.EntryType.HALO)
+    )
+    local_points = data_alloc.as_numpy(
+        decomposition_info.local_index(dimension, definitions.DecompositionInfo.EntryType.OWNED)
+    )
     assert (input_field.ndarray == number).all()
     exchange.exchange(dimension, input_field, stream=definitions.BLOCK)
     result = input_field.asnumpy()
diff --git a/model/common/tests/common/decomposition/unit_tests/test_halo.py b/model/common/tests/common/decomposition/unit_tests/test_halo.py
index d37a234edc..2145fe6be1 100644
--- a/model/common/tests/common/decomposition/unit_tests/test_halo.py
+++ b/model/common/tests/common/decomposition/unit_tests/test_halo.py
@@ -34,7 +34,9 @@ def test_halo_constructor_owned_cells(rank, simple_neighbor_tables, backend_like
         allocator=allocator,
     )
     xp = data_alloc.import_array_ns(allocator)
-    my_owned_cells = data_alloc.as_numpy(halo_generator.owned_cells(xp.asarray(utils.SIMPLE_DISTRIBUTION)))
+    my_owned_cells = data_alloc.as_numpy(
+        halo_generator.owned_cells(xp.asarray(utils.SIMPLE_DISTRIBUTION))
+    )
 
     print(f"rank {processor_props.rank} owns {my_owned_cells} ")
     assert my_owned_cells.size == len(utils._CELL_OWN[processor_props.rank])
diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 3399019422..680697936d 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -346,7 +346,11 @@ def test_interpolation_fields_compare_single_multi_rank(
         global_reference_field=field_ref.asnumpy(),
         local_field=field.asnumpy(),
         check_halos=True,
-        atol=3e-9 if attrs_name.startswith("rbf") else 1e-10 if attrs_name.startswith("pos_on_tplane") else 1e-15,
+        atol=3e-9
+        if attrs_name.startswith("rbf")
+        else 1e-10
+        if attrs_name.startswith("pos_on_tplane")
+        else 1e-15,
     )
 
     _log.info(f"rank = {processor_props.rank} - DONE")
@@ -716,7 +720,7 @@ def test_metrics_mask_prog_halo_c(
     )
     assert (
         field[halo_indices]
-        == ~((c_refin_ctrl[halo_indices] >= 1) & (c_refin_ctrl[halo_indices] <= 4))
+        == xp.invert((c_refin_ctrl[halo_indices] >= 1) & (c_refin_ctrl[halo_indices] <= 4))
     ).all(), f"rank={processor_props.rank} - halo for MASK_PROG_HALO_C is incorrect"
 
     _log.info(f"rank = {processor_props.rank} - DONE")
diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
index a953113b47..b7dae66a95 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_refinement.py
@@ -81,7 +81,10 @@ def test_compute_domain_bounds(
     decomposition_info = grid_savepoint.construct_decomposition_info()
     refin_ctrl = {dim: grid_savepoint.refin_ctrl(dim) for dim in utils.main_horizontal_dims()}
     start_indices, end_indices = grid_refinement.compute_domain_bounds(
-        dim, refin_ctrl, decomposition_info, array_ns=data_alloc.import_array_ns(backend),
+        dim,
+        refin_ctrl,
+        decomposition_info,
+        array_ns=data_alloc.import_array_ns(backend),
     )
     if (
         experiment == definitions.Experiments.GAUSS3D
diff --git a/model/common/tests/common/math/unit_tests/test_smagorinsky.py b/model/common/tests/common/math/unit_tests/test_smagorinsky.py
index c8d673436a..4c0e27e116 100644
--- a/model/common/tests/common/math/unit_tests/test_smagorinsky.py
+++ b/model/common/tests/common/math/unit_tests/test_smagorinsky.py
@@ -24,8 +24,8 @@ def test_init_enh_smag_fac(backend_like: model_backends.BackendLike, grid: base_
     a_vec = data_alloc.random_field(
         grid, dims.KDim, low=1.0, high=10.0, extend={dims.KDim: 1}, allocator=backend
     )
-    fac = np.asarray([0.67, 0.5, 1.3, 0.8])
-    z = np.asarray([0.1, 0.2, 0.3, 0.4])
+    fac = (0.67, 0.5, 1.3, 0.8)
+    z = (0.1, 0.2, 0.3, 0.4)
 
     enhanced_smag_fac_np = enhanced_smagorinski_factor_numpy(fac, z, a_vec.asnumpy())
     en_smag_fac_for_zero_nshift.with_backend(backend)(
diff --git a/model/testing/src/icon4py/model/testing/parallel_helpers.py b/model/testing/src/icon4py/model/testing/parallel_helpers.py
index 8d4a24782f..43a5e99caa 100644
--- a/model/testing/src/icon4py/model/testing/parallel_helpers.py
+++ b/model/testing/src/icon4py/model/testing/parallel_helpers.py
@@ -124,7 +124,9 @@ def check_local_global_field(
     gathered_sizes, gathered_field = gather_field(owned_entries, processor_props)
 
     global_index_sizes, gathered_global_indices = gather_field(
-        data_alloc.as_numpy(decomposition_info.global_index(dim, definitions.DecompositionInfo.EntryType.OWNED)),
+        data_alloc.as_numpy(
+            decomposition_info.global_index(dim, definitions.DecompositionInfo.EntryType.OWNED)
+        ),
         processor_props,
     )
 
diff --git a/model/testing/src/icon4py/model/testing/serialbox.py b/model/testing/src/icon4py/model/testing/serialbox.py
index 151c71d1a5..bf609f41b1 100644
--- a/model/testing/src/icon4py/model/testing/serialbox.py
+++ b/model/testing/src/icon4py/model/testing/serialbox.py
@@ -135,7 +135,9 @@ def _read_bool(self, name: str):
         return self._read(name, offset=0, dtype=bool)
 
     def _read(self, name: str, offset=0, dtype=int):
-        return self.xp.asarray(self.xp.squeeze(self.serializer.read(name, self.savepoint) - offset).astype(dtype))
+        return self.xp.asarray(
+            self.xp.squeeze(self.serializer.read(name, self.savepoint) - offset).astype(dtype)
+        )
 
 
 class IconGridSavepoint(IconSavepoint):

From a5c633ff13a930372a4b9765cb0848abd66fc1b3 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 20 Mar 2026 13:15:34 +0100
Subject: [PATCH 55/68] Make some field tests unit tests in
 test_parallel_grid_manager.py

---
 .../mpi_tests/test_parallel_grid_manager.py   | 212 +++++++++++-------
 1 file changed, 135 insertions(+), 77 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 680697936d..196cadf5fe 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -234,34 +234,7 @@ def test_geometry_fields_compare_single_multi_rank(
     _log.info(f"rank = {processor_props.rank} - DONE")
 
 
-@pytest.mark.mpi
-@pytest.mark.parametrize("processor_props", [True], indirect=True)
-@pytest.mark.parametrize(
-    "attrs_name",
-    [
-        interpolation_attributes.CELL_AW_VERTS,
-        interpolation_attributes.C_BLN_AVG,
-        interpolation_attributes.C_LIN_E,
-        interpolation_attributes.E_BLN_C_S,
-        interpolation_attributes.E_FLX_AVG,
-        interpolation_attributes.GEOFAC_DIV,
-        interpolation_attributes.GEOFAC_GRDIV,
-        interpolation_attributes.GEOFAC_GRG_X,
-        interpolation_attributes.GEOFAC_GRG_Y,
-        interpolation_attributes.GEOFAC_N2S,
-        interpolation_attributes.GEOFAC_ROT,
-        interpolation_attributes.LSQ_PSEUDOINV,
-        interpolation_attributes.NUDGECOEFFS_E,
-        interpolation_attributes.POS_ON_TPLANE_E_X,
-        interpolation_attributes.POS_ON_TPLANE_E_Y,
-        interpolation_attributes.RBF_VEC_COEFF_C1,
-        interpolation_attributes.RBF_VEC_COEFF_C2,
-        interpolation_attributes.RBF_VEC_COEFF_E,
-        interpolation_attributes.RBF_VEC_COEFF_V1,
-        interpolation_attributes.RBF_VEC_COEFF_V2,
-    ],
-)
-def test_interpolation_fields_compare_single_multi_rank(
+def _compare_interpolation_fields_single_multi_rank(
     processor_props: decomp_defs.ProcessProperties,
     backend: gtx_typing.Backend | None,
     experiment: test_defs.Experiment,
@@ -356,62 +329,65 @@ def test_interpolation_fields_compare_single_multi_rank(
     _log.info(f"rank = {processor_props.rank} - DONE")
 
 
+@pytest.mark.level("unit")
 @pytest.mark.mpi
 @pytest.mark.parametrize("processor_props", [True], indirect=True)
 @pytest.mark.parametrize(
     "attrs_name",
     [
-        metrics_attributes.CELL_HEIGHT_ON_HALF_LEVEL,
-        metrics_attributes.COEFF1_DWDZ,
-        metrics_attributes.COEFF2_DWDZ,
-        metrics_attributes.COEFF_GRADEKIN,
-        metrics_attributes.D2DEXDZ2_FAC1_MC,
-        metrics_attributes.D2DEXDZ2_FAC2_MC,
-        metrics_attributes.DDQZ_Z_FULL,
-        metrics_attributes.DDQZ_Z_FULL_E,
-        metrics_attributes.DDQZ_Z_HALF,
-        metrics_attributes.DDXN_Z_FULL,
-        metrics_attributes.DDXN_Z_HALF_E,
-        metrics_attributes.DDXT_Z_FULL,
-        metrics_attributes.DDXT_Z_HALF_E,
-        metrics_attributes.D_EXNER_DZ_REF_IC,
-        metrics_attributes.EXNER_EXFAC,
-        metrics_attributes.EXNER_REF_MC,
-        metrics_attributes.EXNER_W_EXPLICIT_WEIGHT_PARAMETER,
-        metrics_attributes.EXNER_W_IMPLICIT_WEIGHT_PARAMETER,
-        metrics_attributes.FLAT_IDX_MAX,
-        metrics_attributes.HORIZONTAL_MASK_FOR_3D_DIVDAMP,
-        metrics_attributes.INV_DDQZ_Z_FULL,
-        metrics_attributes.MAXHGTD,
-        metrics_attributes.MAXHGTD_AVG,
-        metrics_attributes.MAXSLP,
-        metrics_attributes.MAXSLP_AVG,
-        metrics_attributes.MAX_NBHGT,
-        metrics_attributes.NFLAT_GRADP,
-        metrics_attributes.PG_EXDIST_DSL,
-        metrics_attributes.RAYLEIGH_W,
-        metrics_attributes.RHO_REF_MC,
-        metrics_attributes.RHO_REF_ME,
-        metrics_attributes.SCALING_FACTOR_FOR_3D_DIVDAMP,
-        metrics_attributes.DEEPATMO_DIVH,
-        metrics_attributes.DEEPATMO_DIVZL,
-        metrics_attributes.DEEPATMO_DIVZU,
-        metrics_attributes.THETA_REF_IC,
-        metrics_attributes.THETA_REF_MC,
-        metrics_attributes.THETA_REF_ME,
-        metrics_attributes.VERTOFFSET_GRADP,
-        metrics_attributes.WGTFACQ_C,
-        metrics_attributes.WGTFACQ_E,
-        metrics_attributes.WGTFAC_C,
-        metrics_attributes.WGTFAC_E,
-        metrics_attributes.ZDIFF_GRADP,
-        metrics_attributes.ZD_DIFFCOEF,
-        metrics_attributes.ZD_INTCOEF,
-        metrics_attributes.ZD_VERTOFFSET,
-        metrics_attributes.Z_MC,
+        interpolation_attributes.CELL_AW_VERTS,
+        interpolation_attributes.C_BLN_AVG,
+        interpolation_attributes.C_LIN_E,
+        interpolation_attributes.E_BLN_C_S,
+        interpolation_attributes.GEOFAC_DIV,
+        interpolation_attributes.GEOFAC_ROT,
+        interpolation_attributes.LSQ_PSEUDOINV,
+        interpolation_attributes.NUDGECOEFFS_E,
+        interpolation_attributes.POS_ON_TPLANE_E_X,
+        interpolation_attributes.POS_ON_TPLANE_E_Y,
+    ],
+)
+def test_interpolation_fields_compare_single_multi_rank_unit(
+    processor_props: decomp_defs.ProcessProperties,
+    backend: gtx_typing.Backend | None,
+    experiment: test_defs.Experiment,
+    attrs_name: str,
+) -> None:
+    _compare_interpolation_fields_single_multi_rank(
+        processor_props, backend, experiment, attrs_name
+    )
+
+
+@pytest.mark.level("integration")
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+@pytest.mark.parametrize(
+    "attrs_name",
+    [
+        interpolation_attributes.E_FLX_AVG,
+        interpolation_attributes.GEOFAC_GRDIV,
+        interpolation_attributes.GEOFAC_GRG_X,
+        interpolation_attributes.GEOFAC_GRG_Y,
+        interpolation_attributes.GEOFAC_N2S,
+        interpolation_attributes.RBF_VEC_COEFF_C1,
+        interpolation_attributes.RBF_VEC_COEFF_C2,
+        interpolation_attributes.RBF_VEC_COEFF_E,
+        interpolation_attributes.RBF_VEC_COEFF_V1,
+        interpolation_attributes.RBF_VEC_COEFF_V2,
     ],
 )
-def test_metrics_fields_compare_single_multi_rank(
+def test_interpolation_fields_compare_single_multi_rank_integration(
+    processor_props: decomp_defs.ProcessProperties,
+    backend: gtx_typing.Backend | None,
+    experiment: test_defs.Experiment,
+    attrs_name: str,
+) -> None:
+    _compare_interpolation_fields_single_multi_rank(
+        processor_props, backend, experiment, attrs_name
+    )
+
+
+def _compare_metrics_fields_single_multi_rank(
     processor_props: decomp_defs.ProcessProperties,
     backend: gtx_typing.Backend | None,
     experiment: test_defs.Experiment,
@@ -590,6 +566,88 @@ def test_metrics_fields_compare_single_multi_rank(
     _log.info(f"rank = {processor_props.rank} - DONE")
 
 
+@pytest.mark.level("unit")
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+@pytest.mark.parametrize(
+    "attrs_name",
+    [
+        metrics_attributes.CELL_HEIGHT_ON_HALF_LEVEL,
+        metrics_attributes.COEFF_GRADEKIN,
+        metrics_attributes.DDQZ_Z_FULL,
+        metrics_attributes.DDXN_Z_HALF_E,
+        metrics_attributes.DDXT_Z_HALF_E,
+        metrics_attributes.D_EXNER_DZ_REF_IC,
+        metrics_attributes.EXNER_REF_MC,
+        metrics_attributes.EXNER_W_IMPLICIT_WEIGHT_PARAMETER,
+        metrics_attributes.FLAT_IDX_MAX,
+        metrics_attributes.HORIZONTAL_MASK_FOR_3D_DIVDAMP,
+        metrics_attributes.INV_DDQZ_Z_FULL,
+        metrics_attributes.MAXHGTD,
+        metrics_attributes.MAXSLP,
+        metrics_attributes.MAX_NBHGT,
+        metrics_attributes.PG_EXDIST_DSL,
+        metrics_attributes.RAYLEIGH_W,
+        metrics_attributes.RHO_REF_MC,
+        metrics_attributes.RHO_REF_ME,
+        metrics_attributes.SCALING_FACTOR_FOR_3D_DIVDAMP,
+        metrics_attributes.DEEPATMO_DIVH,
+        metrics_attributes.DEEPATMO_DIVZL,
+        metrics_attributes.DEEPATMO_DIVZU,
+        metrics_attributes.THETA_REF_IC,
+        metrics_attributes.THETA_REF_MC,
+        metrics_attributes.THETA_REF_ME,
+        metrics_attributes.VERTOFFSET_GRADP,
+        metrics_attributes.WGTFACQ_C,
+        metrics_attributes.WGTFAC_C,
+        metrics_attributes.ZDIFF_GRADP,
+        metrics_attributes.ZD_DIFFCOEF,
+        metrics_attributes.Z_MC,
+    ],
+)
+def test_metrics_fields_compare_single_multi_rank_unit(
+    processor_props: decomp_defs.ProcessProperties,
+    backend: gtx_typing.Backend | None,
+    experiment: test_defs.Experiment,
+    attrs_name: str,
+) -> None:
+    _compare_metrics_fields_single_multi_rank(processor_props, backend, experiment, attrs_name)
+
+
+@pytest.mark.level("integration")
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+@pytest.mark.parametrize(
+    "attrs_name",
+    [
+        metrics_attributes.COEFF1_DWDZ,
+        metrics_attributes.COEFF2_DWDZ,
+        metrics_attributes.D2DEXDZ2_FAC1_MC,
+        metrics_attributes.D2DEXDZ2_FAC2_MC,
+        metrics_attributes.DDQZ_Z_FULL_E,
+        metrics_attributes.DDQZ_Z_HALF,
+        metrics_attributes.DDXN_Z_FULL,
+        metrics_attributes.DDXT_Z_FULL,
+        metrics_attributes.EXNER_EXFAC,
+        metrics_attributes.EXNER_W_EXPLICIT_WEIGHT_PARAMETER,
+        metrics_attributes.MAXHGTD_AVG,
+        metrics_attributes.MAXSLP_AVG,
+        metrics_attributes.NFLAT_GRADP,
+        metrics_attributes.WGTFACQ_E,
+        metrics_attributes.WGTFAC_E,
+        metrics_attributes.ZD_INTCOEF,
+        metrics_attributes.ZD_VERTOFFSET,
+    ],
+)
+def test_metrics_fields_compare_single_multi_rank_integration(
+    processor_props: decomp_defs.ProcessProperties,
+    backend: gtx_typing.Backend | None,
+    experiment: test_defs.Experiment,
+    attrs_name: str,
+) -> None:
+    _compare_metrics_fields_single_multi_rank(processor_props, backend, experiment, attrs_name)
+
+
 # MASK_PROG_HALO_C is defined specially only on halos, so we have a separate
 # test for it. It doesn't make sense to compare to a single-rank reference since
 # it has no halos.

From 20c0ea1166ef9f20a631b7ebfb8b442fbaf2fc77 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 20 Mar 2026 15:30:50 +0100
Subject: [PATCH 56/68] Don't test r01b01 grid anymore

---
 model/testing/src/icon4py/model/testing/fixtures/datatest.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/model/testing/src/icon4py/model/testing/fixtures/datatest.py b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
index 7730656d5a..8b07dd27b9 100644
--- a/model/testing/src/icon4py/model/testing/fixtures/datatest.py
+++ b/model/testing/src/icon4py/model/testing/fixtures/datatest.py
@@ -81,7 +81,6 @@ def cpu_allocator() -> gtx_typing.Allocator:
 
 @pytest.fixture(
     params=[
-        definitions.Grids.R01B01_GLOBAL,
         definitions.Grids.R02B04_GLOBAL,
         definitions.Grids.MCH_CH_R04B09_DSL,
         definitions.Grids.TORUS_50000x5000,

From 7cccfef7757f64603e9fe7d045556bbc7d5aea59 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 20 Mar 2026 16:16:05 +0100
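Subject: [PATCH 57/68] Split geometry fields test in
 test_parallel_grid_manager.py into unit and integration test

Also move GEOFAC_GRG_Y, RBF_VEC_COEFF_C2 and RBF_VEC_COEFF_V2 from the
integration to the unit interpolation test, and add further metrics
attributes (e.g. COEFF2_DWDZ, DDXT_Z_FULL, MAXSLP_AVG) to the unit
metrics test.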
---
 .../mpi_tests/test_parallel_grid_manager.py   | 165 +++++++++++-------
 1 file changed, 98 insertions(+), 67 deletions(-)

diff --git a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
index 196cadf5fe..c062594487 100644
--- a/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
+++ b/model/common/tests/common/grid/mpi_tests/test_parallel_grid_manager.py
@@ -102,62 +102,7 @@ def _get_neighbor_tables(grid: base.Grid) -> dict:
 }
 
 
-@pytest.mark.mpi
-@pytest.mark.parametrize("processor_props", [True], indirect=True)
-@pytest.mark.parametrize(
-    "attrs_name",
-    [
-        geometry_attributes.CELL_AREA,
-        geometry_attributes.CELL_CENTER_X,
-        geometry_attributes.CELL_CENTER_Y,
-        geometry_attributes.CELL_CENTER_Z,
-        geometry_attributes.CELL_LAT,
-        geometry_attributes.CELL_LON,
-        geometry_attributes.CELL_NORMAL_ORIENTATION,
-        geometry_attributes.CORIOLIS_PARAMETER,
-        geometry_attributes.DUAL_AREA,
-        geometry_attributes.DUAL_EDGE_LENGTH,
-        f"inverse_of_{geometry_attributes.DUAL_EDGE_LENGTH}",
-        geometry_attributes.EDGE_AREA,
-        geometry_attributes.EDGE_CELL_DISTANCE,
-        geometry_attributes.EDGE_CENTER_X,
-        geometry_attributes.EDGE_CENTER_Y,
-        geometry_attributes.EDGE_CENTER_Z,
-        geometry_attributes.EDGE_DUAL_U,
-        geometry_attributes.EDGE_DUAL_V,
-        geometry_attributes.EDGE_LAT,
-        f"inverse_of_{geometry_attributes.EDGE_LENGTH}",
-        geometry_attributes.EDGE_LENGTH,
-        geometry_attributes.EDGE_LON,
-        geometry_attributes.EDGE_NORMAL_CELL_U,
-        geometry_attributes.EDGE_NORMAL_CELL_V,
-        geometry_attributes.EDGE_NORMAL_U,
-        geometry_attributes.EDGE_NORMAL_V,
-        geometry_attributes.EDGE_NORMAL_VERTEX_U,
-        geometry_attributes.EDGE_NORMAL_VERTEX_V,
-        geometry_attributes.EDGE_NORMAL_X,
-        geometry_attributes.EDGE_NORMAL_Y,
-        geometry_attributes.EDGE_NORMAL_Z,
-        geometry_attributes.EDGE_TANGENT_CELL_U,
-        geometry_attributes.EDGE_TANGENT_CELL_V,
-        geometry_attributes.EDGE_TANGENT_VERTEX_U,
-        geometry_attributes.EDGE_TANGENT_VERTEX_V,
-        geometry_attributes.EDGE_TANGENT_X,
-        geometry_attributes.EDGE_TANGENT_Y,
-        geometry_attributes.EDGE_TANGENT_Z,
-        geometry_attributes.EDGE_VERTEX_DISTANCE,
-        geometry_attributes.TANGENT_ORIENTATION,
-        geometry_attributes.VERTEX_EDGE_ORIENTATION,
-        geometry_attributes.VERTEX_LAT,
-        geometry_attributes.VERTEX_LON,
-        geometry_attributes.VERTEX_VERTEX_LENGTH,
-        f"inverse_of_{geometry_attributes.VERTEX_VERTEX_LENGTH}",
-        geometry_attributes.VERTEX_X,
-        geometry_attributes.VERTEX_Y,
-        geometry_attributes.VERTEX_Z,
-    ],
-)
-def test_geometry_fields_compare_single_multi_rank(
+def _compare_geometry_fields_single_multi_rank(
     processor_props: decomp_defs.ProcessProperties,
     backend: gtx_typing.Backend | None,
     grid_description: test_defs.GridDescription,
@@ -234,6 +179,92 @@ def test_geometry_fields_compare_single_multi_rank(
     _log.info(f"rank = {processor_props.rank} - DONE")
 
 
+@pytest.mark.level("unit")
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+@pytest.mark.parametrize(
+    "attrs_name",
+    [
+        geometry_attributes.CELL_CENTER_Y,
+        geometry_attributes.CELL_CENTER_Z,
+        geometry_attributes.CELL_LON,
+        geometry_attributes.DUAL_EDGE_LENGTH,
+        geometry_attributes.EDGE_CENTER_Y,
+        geometry_attributes.EDGE_CENTER_Z,
+        geometry_attributes.EDGE_DUAL_V,
+        geometry_attributes.EDGE_LENGTH,
+        geometry_attributes.EDGE_LON,
+        geometry_attributes.EDGE_NORMAL_CELL_V,
+        geometry_attributes.EDGE_NORMAL_V,
+        geometry_attributes.EDGE_NORMAL_VERTEX_V,
+        geometry_attributes.EDGE_NORMAL_Y,
+        geometry_attributes.EDGE_NORMAL_Z,
+        geometry_attributes.EDGE_TANGENT_CELL_V,
+        geometry_attributes.EDGE_TANGENT_VERTEX_V,
+        geometry_attributes.EDGE_TANGENT_Y,
+        geometry_attributes.EDGE_TANGENT_Z,
+        geometry_attributes.VERTEX_LON,
+        geometry_attributes.VERTEX_VERTEX_LENGTH,
+        geometry_attributes.VERTEX_Y,
+        geometry_attributes.VERTEX_Z,
+    ],
+)
+def test_geometry_fields_compare_single_multi_rank_unit(
+    processor_props: decomp_defs.ProcessProperties,
+    backend: gtx_typing.Backend | None,
+    grid_description: test_defs.GridDescription,
+    attrs_name: str,
+) -> None:
+    _compare_geometry_fields_single_multi_rank(
+        processor_props, backend, grid_description, attrs_name
+    )
+
+
+@pytest.mark.level("integration")
+@pytest.mark.mpi
+@pytest.mark.parametrize("processor_props", [True], indirect=True)
+@pytest.mark.parametrize(
+    "attrs_name",
+    [
+        geometry_attributes.CELL_AREA,
+        geometry_attributes.CELL_CENTER_X,
+        geometry_attributes.CELL_LAT,
+        geometry_attributes.CELL_NORMAL_ORIENTATION,
+        geometry_attributes.CORIOLIS_PARAMETER,
+        geometry_attributes.DUAL_AREA,
+        f"inverse_of_{geometry_attributes.DUAL_EDGE_LENGTH}",
+        geometry_attributes.EDGE_AREA,
+        geometry_attributes.EDGE_CELL_DISTANCE,
+        geometry_attributes.EDGE_CENTER_X,
+        geometry_attributes.EDGE_DUAL_U,
+        geometry_attributes.EDGE_LAT,
+        f"inverse_of_{geometry_attributes.EDGE_LENGTH}",
+        geometry_attributes.EDGE_NORMAL_CELL_U,
+        geometry_attributes.EDGE_NORMAL_U,
+        geometry_attributes.EDGE_NORMAL_VERTEX_U,
+        geometry_attributes.EDGE_NORMAL_X,
+        geometry_attributes.EDGE_TANGENT_CELL_U,
+        geometry_attributes.EDGE_TANGENT_VERTEX_U,
+        geometry_attributes.EDGE_TANGENT_X,
+        geometry_attributes.EDGE_VERTEX_DISTANCE,
+        geometry_attributes.TANGENT_ORIENTATION,
+        geometry_attributes.VERTEX_EDGE_ORIENTATION,
+        geometry_attributes.VERTEX_LAT,
+        f"inverse_of_{geometry_attributes.VERTEX_VERTEX_LENGTH}",
+        geometry_attributes.VERTEX_X,
+    ],
+)
+def test_geometry_fields_compare_single_multi_rank_integration(
+    processor_props: decomp_defs.ProcessProperties,
+    backend: gtx_typing.Backend | None,
+    grid_description: test_defs.GridDescription,
+    attrs_name: str,
+) -> None:
+    _compare_geometry_fields_single_multi_rank(
+        processor_props, backend, grid_description, attrs_name
+    )
+
+
 def _compare_interpolation_fields_single_multi_rank(
     processor_props: decomp_defs.ProcessProperties,
     backend: gtx_typing.Backend | None,
@@ -340,11 +371,14 @@ def _compare_interpolation_fields_single_multi_rank(
         interpolation_attributes.C_LIN_E,
         interpolation_attributes.E_BLN_C_S,
         interpolation_attributes.GEOFAC_DIV,
+        interpolation_attributes.GEOFAC_GRG_Y,
         interpolation_attributes.GEOFAC_ROT,
         interpolation_attributes.LSQ_PSEUDOINV,
         interpolation_attributes.NUDGECOEFFS_E,
         interpolation_attributes.POS_ON_TPLANE_E_X,
         interpolation_attributes.POS_ON_TPLANE_E_Y,
+        interpolation_attributes.RBF_VEC_COEFF_C2,
+        interpolation_attributes.RBF_VEC_COEFF_V2,
     ],
 )
 def test_interpolation_fields_compare_single_multi_rank_unit(
@@ -367,13 +401,10 @@ def test_interpolation_fields_compare_single_multi_rank_unit(
         interpolation_attributes.E_FLX_AVG,
         interpolation_attributes.GEOFAC_GRDIV,
         interpolation_attributes.GEOFAC_GRG_X,
-        interpolation_attributes.GEOFAC_GRG_Y,
         interpolation_attributes.GEOFAC_N2S,
         interpolation_attributes.RBF_VEC_COEFF_C1,
-        interpolation_attributes.RBF_VEC_COEFF_C2,
         interpolation_attributes.RBF_VEC_COEFF_E,
         interpolation_attributes.RBF_VEC_COEFF_V1,
-        interpolation_attributes.RBF_VEC_COEFF_V2,
     ],
 )
 def test_interpolation_fields_compare_single_multi_rank_integration(
@@ -573,10 +604,16 @@ def _compare_metrics_fields_single_multi_rank(
     "attrs_name",
     [
         metrics_attributes.CELL_HEIGHT_ON_HALF_LEVEL,
+        metrics_attributes.COEFF2_DWDZ,
         metrics_attributes.COEFF_GRADEKIN,
+        metrics_attributes.D2DEXDZ2_FAC2_MC,
         metrics_attributes.DDQZ_Z_FULL,
         metrics_attributes.DDXN_Z_HALF_E,
+        metrics_attributes.DDXT_Z_FULL,
         metrics_attributes.DDXT_Z_HALF_E,
+        metrics_attributes.DEEPATMO_DIVH,
+        metrics_attributes.DEEPATMO_DIVZL,
+        metrics_attributes.DEEPATMO_DIVZU,
         metrics_attributes.D_EXNER_DZ_REF_IC,
         metrics_attributes.EXNER_REF_MC,
         metrics_attributes.EXNER_W_IMPLICIT_WEIGHT_PARAMETER,
@@ -585,15 +622,13 @@ def _compare_metrics_fields_single_multi_rank(
         metrics_attributes.INV_DDQZ_Z_FULL,
         metrics_attributes.MAXHGTD,
         metrics_attributes.MAXSLP,
+        metrics_attributes.MAXSLP_AVG,
         metrics_attributes.MAX_NBHGT,
         metrics_attributes.PG_EXDIST_DSL,
         metrics_attributes.RAYLEIGH_W,
         metrics_attributes.RHO_REF_MC,
         metrics_attributes.RHO_REF_ME,
         metrics_attributes.SCALING_FACTOR_FOR_3D_DIVDAMP,
-        metrics_attributes.DEEPATMO_DIVH,
-        metrics_attributes.DEEPATMO_DIVZL,
-        metrics_attributes.DEEPATMO_DIVZU,
         metrics_attributes.THETA_REF_IC,
         metrics_attributes.THETA_REF_MC,
         metrics_attributes.THETA_REF_ME,
@@ -602,6 +637,7 @@ def _compare_metrics_fields_single_multi_rank(
         metrics_attributes.WGTFAC_C,
         metrics_attributes.ZDIFF_GRADP,
         metrics_attributes.ZD_DIFFCOEF,
+        metrics_attributes.ZD_VERTOFFSET,
         metrics_attributes.Z_MC,
     ],
 )
@@ -621,22 +657,17 @@ def test_metrics_fields_compare_single_multi_rank_unit(
     "attrs_name",
     [
         metrics_attributes.COEFF1_DWDZ,
-        metrics_attributes.COEFF2_DWDZ,
         metrics_attributes.D2DEXDZ2_FAC1_MC,
-        metrics_attributes.D2DEXDZ2_FAC2_MC,
         metrics_attributes.DDQZ_Z_FULL_E,
         metrics_attributes.DDQZ_Z_HALF,
         metrics_attributes.DDXN_Z_FULL,
-        metrics_attributes.DDXT_Z_FULL,
         metrics_attributes.EXNER_EXFAC,
         metrics_attributes.EXNER_W_EXPLICIT_WEIGHT_PARAMETER,
         metrics_attributes.MAXHGTD_AVG,
-        metrics_attributes.MAXSLP_AVG,
         metrics_attributes.NFLAT_GRADP,
         metrics_attributes.WGTFACQ_E,
         metrics_attributes.WGTFAC_E,
         metrics_attributes.ZD_INTCOEF,
-        metrics_attributes.ZD_VERTOFFSET,
     ],
 )
 def test_metrics_fields_compare_single_multi_rank_integration(

From 0b063cc7ccda522275ae1fada1e04a62bd09a6f9 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 20 Mar 2026 19:53:45 +0100
Subject: [PATCH 58/68] Only run integration tests in distributed CI

---
 ci/distributed.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index a0797d5f90..6548091301 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -87,11 +87,12 @@ build_distributed:
     # TODO
     # - ci/scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
     - uv pip install pytest-instafail
-    - pytest --instafail -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
+    - pytest --instafail -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT --level=$LEVEL
   parallel:
     matrix:
       - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
         BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu]
+        LEVEL: [integration]
   rules:
     - if: $COMPONENT == 'atmosphere/diffusion'
       variables:

From eb01d9af847232d6bbed35c07af1110d2bd39490 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Mon, 23 Mar 2026 09:59:53 +0100
Subject: [PATCH 59/68] Apply suggestion from @msimberg

---
 ci/distributed.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 6548091301..e027aaa234 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -84,10 +84,7 @@ build_distributed:
     - source ${UV_PROJECT_ENVIRONMENT}/bin/activate
     - echo "running with $(python --version)"
   script:
-    # TODO
-    # - ci/scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT
-    - uv pip install pytest-instafail
-    - pytest --instafail -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT --level=$LEVEL
+    - ci/scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT --level=$LEVEL
   parallel:
     matrix:
       - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]

From fd93fb87844d2cffd640d2653aa29a13817c9ac2 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Mon, 23 Mar 2026 11:40:07 +0100
Subject: [PATCH 60/68] Test only dace_gpu/common in distributed pipeline

---
 ci/distributed.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index e027aaa234..a9f261c29d 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -87,8 +87,10 @@ build_distributed:
     - ci/scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT --level=$LEVEL
   parallel:
     matrix:
-      - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
-        BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu]
+      # - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
+      - COMPONENT: [common]
+        # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu]
+        BACKEND: [dace_gpu]
         LEVEL: [integration]
   rules:
     - if: $COMPONENT == 'atmosphere/diffusion'

From 702d0bdb29d7188a6e4059e55697608757f98cc6 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Mon, 23 Mar 2026 13:38:09 +0100
Subject: [PATCH 61/68] Try persistent cache and more workers on distributed CI
 pipeline

---
 ci/distributed.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index a9f261c29d..1a56ab526c 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -62,6 +62,8 @@ build_distributed:
     ICON4PY_TEST_DATA_PATH: "/icon4py/testdata"
     ICON4PY_ENABLE_GRID_DOWNLOAD: false
     ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false
+    GT4PY_BUILD_JOBS: 32
+    GT4PY_BUILD_CACHE_LIFETIME: "persistent"
     PYTEST_ADDOPTS: "--durations=0"
     CSCS_ADDITIONAL_MOUNTS: '["/capstor/store/cscs/userlab/cwci02/icon4py/ci/testdata:$ICON4PY_TEST_DATA_PATH"]'
     # Do not use libfabric from the host system. Libfabric with slingshot

From a7f60f08270118d686d93521284a6518a7331ba1 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Tue, 24 Mar 2026 16:30:22 +0100
Subject: [PATCH 62/68] Test only gtfn_gpu

---
 ci/distributed.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 1a56ab526c..b039cde5ca 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -92,7 +92,8 @@ build_distributed:
       # - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
       - COMPONENT: [common]
         # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu]
-        BACKEND: [dace_gpu]
+        # BACKEND: [dace_gpu]
+        BACKEND: [gtfn_gpu]
         LEVEL: [integration]
   rules:
     - if: $COMPONENT == 'atmosphere/diffusion'

From 5652ce8458a46d7bfe4d8d6ea9cec33bbfb29a1f Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 25 Mar 2026 09:48:49 +0100
Subject: [PATCH 63/68] Update distributed config

---
 ci/distributed.yml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index b039cde5ca..d3ecfef697 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -89,11 +89,10 @@ build_distributed:
     - ci/scripts/ci-mpi-wrapper.sh pytest -sv -k mpi_tests --with-mpi --backend=$BACKEND model/$COMPONENT --level=$LEVEL
   parallel:
     matrix:
-      # - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
-      - COMPONENT: [common]
-        # BACKEND: [embedded, gtfn_cpu, dace_cpu, dace_gpu, gtfn_gpu]
-        # BACKEND: [dace_gpu]
-        BACKEND: [gtfn_gpu]
+      - COMPONENT: [atmosphere/diffusion, atmosphere/dycore, common]
+        # TODO(msimberg): Enable dace_gpu when compilation doesn't take as long
+        # or when we can cache across CI jobs.
+        BACKEND: [embedded, gtfn_cpu, dace_cpu, gtfn_gpu]
         LEVEL: [integration]
   rules:
     - if: $COMPONENT == 'atmosphere/diffusion'
@@ -104,7 +103,7 @@ build_distributed:
         SLURM_TIMELIMIT: '00:30:00'
     - if: $COMPONENT == 'common' && ($BACKEND == 'dace_gpu' || $BACKEND == 'gtfn_gpu')
       variables:
-        # TODO(msimberg): This is very long, can we do better?
+        # TODO(msimberg): Decrease this when enabling dace_gpu above, if possible.
         SLURM_TIMELIMIT: '03:00:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:

From 5ba050c0798136cfe6344d6fd887b6e4e1200be2 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 25 Mar 2026 14:28:55 +0100
Subject: [PATCH 64/68] Upgrade mpi4py

---
 uv.lock | 58 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 9 deletions(-)

diff --git a/uv.lock b/uv.lock
index 8bae6ba865..0c21145096 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2523,15 +2523,55 @@ wheels = [
 
 [[package]]
 name = "mpi4py"
-version = "4.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/08/34/8499a92a387d24d0092c38089f8195f13c5c76f0f814126af3fe363e5636/mpi4py-4.0.1.tar.gz", hash = "sha256:f3174b245775d556f4fddb32519a2066ef0592edc810c5b5a59238f9a0a40c89", size = 466179, upload-time = "2024-10-11T10:59:53.425Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/22/15/7d2fd2ca8b1ae362371b2bb9b2f787f9166b6ecd536e0e773dce6b98a5a9/mpi4py-4.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:600f26cae7f390b4ec525f5c1ccc374686c37a8c07f9c21320866c0a323f6dae", size = 1588594, upload-time = "2024-10-12T07:10:26.736Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/f7/6dfdee53f9806361ab75cb83ee5feab06a738f7f6a42715c79d72a783d31/mpi4py-4.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:0cb209fcdc7fee0346d12edff1cfd1c1ffca1b807c53631ba0436b9c2bcf8229", size = 1599377, upload-time = "2024-10-12T07:10:30.836Z" },
-    { url = "https://files.pythonhosted.org/packages/35/28/7e5eae1a9940f48c41e208e9e6fdb56e497095030ab53e2d9ce702705cbb/mpi4py-4.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:b704e7db92b1ac94b96802e17cf28082455daa928e8e51398ad9f5e5eb8c9b7b", size = 1727556, upload-time = "2024-10-12T07:10:36.005Z" },
-    { url = "https://files.pythonhosted.org/packages/95/70/cc361869a2920476ecc5f29c98e0130aaf2e177a0087cb7ebbafb90414f1/mpi4py-4.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:52a7b1760b1aeb41a0ea38969314b2b170117a0ded2f689915f1cb89aaaf8a6f", size = 1726170, upload-time = "2024-10-12T07:10:39.15Z" },
-    { url = "https://files.pythonhosted.org/packages/17/23/81aed5da44f9d743f1e76909fd04ae5dc122ff7c9f97fa0b40b8f752245c/mpi4py-4.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:93f45dcc2fd5f3396f961b1bc8f0fb9d5db786fdc0d72e4f8611f47718b5dac8", size = 1584997, upload-time = "2024-10-12T07:10:52.704Z" },
+version = "4.1.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/62/74/28ea85b0b949cad827ea50720e00e814e88c8fd536c27c3c491e4f025724/mpi4py-4.1.1.tar.gz", hash = "sha256:eb2c8489bdbc47fdc6b26ca7576e927a11b070b6de196a443132766b3d0a2a22", size = 500518, upload-time = "2025-10-10T13:55:20.402Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/36/b3/2e7df40608f2188dca16e38f8030add1071f06b1cd94dd8a4e16b9acbd84/mpi4py-4.1.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:1586f5d1557abed9cba7e984d18f32e787b353be0986e599974db177ae36329a", size = 1422849, upload-time = "2025-10-10T13:53:40.082Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/ed/970bd3edc0e614eccc726fa406255b88f728a8bc059e81f96f28d6ede0af/mpi4py-4.1.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ba85e4778d63c750226de95115c92b709f38d7e661be660a275da4f0992ee197", size = 1326982, upload-time = "2025-10-10T13:53:42.32Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/c3/f9a5d1f9ba52ac6386bf3d3550027f42a6b102b0432113cc43294420feb2/mpi4py-4.1.1-cp310-abi3-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0a8332884626994d9ef48da233dc7a0355f4868dd7ff59f078d5813a2935b930", size = 1373127, upload-time = "2025-10-10T13:53:43.957Z" },
+    { url = "https://files.pythonhosted.org/packages/84/d1/1fe75025df801d817ed49371c719559f742f3f263323442d34dbe3366af3/mpi4py-4.1.1-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6e0352860f0b3e18bc0dcb47e42e583ccb9472f89752d711a6fca46a38670554", size = 1225134, upload-time = "2025-10-10T13:53:45.583Z" },
+    { url = "https://files.pythonhosted.org/packages/40/44/d653fec0e4ca8181645da4bfb2763017625e5b3f151b208fadd932cb1766/mpi4py-4.1.1-cp310-abi3-win_amd64.whl", hash = "sha256:0f46dfe666a599e4bd2641116b2b4852a3ed9d37915edf98fae471d666663128", size = 1478863, upload-time = "2025-10-10T13:53:47.178Z" },
+    { url = "https://files.pythonhosted.org/packages/58/f7/793c9a532e5367cffb2b97ca6a879285ca73a14f79e6ff208bb390651a43/mpi4py-4.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9082e04c8afcffa7d650a262d800af1a617c555d610810deeab265a4a5f7d42e", size = 1585904, upload-time = "2025-10-10T13:53:49.129Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/fe/cdead6721426b25d817a1bf45d5adc6dc90fd8bb0831f5ca06a4edd2015c/mpi4py-4.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1d618e6a5a8f6f86c33a954356d8ed398bec31f34b63321570661ac157063bb6", size = 1438343, upload-time = "2025-10-10T13:53:51.098Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/c4/4a73c80cf483df603770278f0fdc57da5394edee376790c62f1eba04bb3b/mpi4py-4.1.1-cp310-cp310-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d4c460609bd6decc22ad89cbfe48e4c5a2461ff52ada9345a4c19edee39f93da", size = 1432321, upload-time = "2025-10-10T13:53:53.235Z" },
+    { url = "https://files.pythonhosted.org/packages/49/56/7b32631f3cc5cf741610a108a7f40a3714c9862c1f637b5ded525af32be9/mpi4py-4.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c04a388c7a945e751c82742c6bb277434d26a67768a01952f7494d1c25dff94b", size = 1299883, upload-time = "2025-10-10T13:53:55.22Z" },
+    { url = "https://files.pythonhosted.org/packages/14/76/53caf807ec74c042fbecf76162e071c09c53fb0ed66b1edf31dabd64c588/mpi4py-4.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:1ad4b225a5a1a02a2b89979ed8f328c6a2bc3bd6ad4a57e453727f90373fa5f8", size = 1622884, upload-time = "2025-10-10T13:53:56.882Z" },
+    { url = "https://files.pythonhosted.org/packages/20/8f/5d28174048ef02fb91dd0759a32c07b272c9f1df265e19145712aa7bd712/mpi4py-4.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a428ba96b992a8911cf932fa71dd8c0260d47ab7e5dee2b09239ad91fc540b79", size = 1596913, upload-time = "2025-10-10T13:53:58.466Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/81/dce928b11816fac9713e93e609476ddac520fc50368aa7591728c329ff19/mpi4py-4.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fc0cf81445fac2ae2e5716c365fd72e1bb545df065f5a3f6731f64b3beed886e", size = 1433274, upload-time = "2025-10-10T13:54:00.508Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/15/1a869a35d3e3438866dc8d8c9cb04dc6aa484171343627a8baf82c3c1ca9/mpi4py-4.1.1-cp311-cp311-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a753d5d61b46f90260247f344a6c57c527a6a4e7bea126830120ab41c3d057e5", size = 1423333, upload-time = "2025-10-10T13:54:03.679Z" },
+    { url = "https://files.pythonhosted.org/packages/25/33/072781fb85f5bc50b93ee7e8d3b3afb849d50570431b6cb2aa957db79b59/mpi4py-4.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4a36ef9d7b2b6b62026dbf9b59b44efb5430f7b9ca5fb855bfbf8d403218e37c", size = 1299183, upload-time = "2025-10-10T13:54:05.3Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/a7/152af3c6412702a4e0fcfd0fe572307ed52821de13db9c96535f31a39aa7/mpi4py-4.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:20bf4c0c65fd67287664f8b1b6dc7c7b341838f10bba34a2e452d47530ce8a5f", size = 1632284, upload-time = "2025-10-10T13:54:06.786Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/2c/e201cd4828555f10306a5439875cbd0ecfba766ace01ff5c6df43f795650/mpi4py-4.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4403a7cec985be9963efc626193e6df3f63f5ada0c26373c28e640e623e56c3", size = 1669517, upload-time = "2025-10-10T13:54:08.404Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/53/18d978c3a19deecf38217ce54319e6c9162fec3569c4256c039b66eac2f4/mpi4py-4.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a2ffccc9f3a8c7c957403faad594d650c60234ac08cbedf45beaa96602debe9", size = 1454721, upload-time = "2025-10-10T13:54:09.977Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/15/b908d1d23a4bd2bd7b2e98de5df23b26e43145119fe294728bf89211b935/mpi4py-4.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ed3d9b619bf197a290f7fd67eb61b1c2a5c204afd9621651a50dc0b1c1280d45", size = 1448977, upload-time = "2025-10-10T13:54:11.65Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/19/088a2d37e80e0feb7851853b2a71cbe6f9b18bdf0eab680977864ea83aab/mpi4py-4.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0699c194db5d95fc2085711e4e0013083bd7ae9a88438e1fd64ddb67e9b0cf9e", size = 1318737, upload-time = "2025-10-10T13:54:13.075Z" },
+    { url = "https://files.pythonhosted.org/packages/97/3a/526261f39bf096e5ff396d18b76740a58d872425612ff84113dd85c2c08e/mpi4py-4.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:0abf5490c3d49c30542b461bfc5ad88dd7d147a4bdb456b7163640577fdfef88", size = 1725676, upload-time = "2025-10-10T13:54:14.681Z" },
+    { url = "https://files.pythonhosted.org/packages/30/75/2ffccd69360680a0216e71f90fd50dc8ff49711be54502d522a068196c68/mpi4py-4.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3dd973c509f2dbb6904c035a4a071509cde98decf0528fa21e2e7d5db5cc988", size = 1710002, upload-time = "2025-10-10T13:54:17.042Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/13/22fa9dcbc5e4ae6fd10cba6d49b7c879c30c5bea88f450f79b373d200f40/mpi4py-4.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c8c83a359e62dd7fdd030360f430e0e8986df029c0953ab216ff97a110038dc4", size = 1484623, upload-time = "2025-10-10T13:54:19.097Z" },
+    { url = "https://files.pythonhosted.org/packages/47/01/476f0f9dc96261d02214009f42e10338fc56f260f1f10b23ee89c515c8b7/mpi4py-4.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:323ba354ba951c7736c033c5f2ad07bb1276f9696f0312ea6ff0a28cd0ab3e3d", size = 1448403, upload-time = "2025-10-10T13:54:21.211Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/20/dc990edb7b075ecdba4e02bcd03d1583faeb84f664d1585c4c00a0f9851a/mpi4py-4.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c4ef9fe5fb211b1c5b6afe521397e3feb01e104024d6bc37aa4289c370605e2", size = 1318018, upload-time = "2025-10-10T13:54:23.23Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/bf/b0ab43a99ac2a1d6d5765cb7d2a4f093656090ce07528043057ecc3e87cb/mpi4py-4.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:e13a1ba26604514a12c95b7d76058ce800d5740d5f5f3b50c4b782cfa0dfaa1f", size = 1722939, upload-time = "2025-10-10T13:54:24.862Z" },
+    { url = "https://files.pythonhosted.org/packages/84/26/3e00dc536311e758096414b4f33beb4c7f04dff875e87a6e88fbbe4fc2d8/mpi4py-4.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:28ce1f7412f5e99a6b9fe2547203633431d0ee45670413a475a07e6c785e63b1", size = 1798116, upload-time = "2025-10-10T13:54:26.378Z" },
+    { url = "https://files.pythonhosted.org/packages/15/51/d06d2b126be5660aca8c00fe0d940a8658085038f61a9cfc834d3d5ffa80/mpi4py-4.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:dd1e49b84a0651018517e87daf68085719eca25e5c9a7cd05d98a73418c88836", size = 1586285, upload-time = "2025-10-10T13:54:27.838Z" },
+    { url = "https://files.pythonhosted.org/packages/51/63/eeb936e0e8cfd8160b6b297645c730b22d242595861cf6a2fa627a358175/mpi4py-4.1.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:dd869ea7758b591ffbb1483588a6fbf84952a5090e80a45ea89674d55cf25f3b", size = 1514102, upload-time = "2025-10-10T13:54:29.297Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/c1/06967d4c107ea7169d2120c4fb86c404707e6de82e277dc9f0fa5a9c1bf1/mpi4py-4.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:475da0797442cba723c0ad37da6a1c51d9624e697dd8bf89f23d0fad81e73eda", size = 1395247, upload-time = "2025-10-10T13:54:30.881Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/7c/5f0f32b39185f0a7074c165dc37cdd235bfd737928a2fe223e41b308fb4c/mpi4py-4.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:8d3bfa074776d9507ee957f5230d11ecd03da23f601a85349a1a333eaf55e5fa", size = 1771515, upload-time = "2025-10-10T13:54:32.395Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/e8/93ddde2b6ee7631b46bb79b851630b3527d9060b9b999844bcd882977539/mpi4py-4.1.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:1deb6f9df28ec6972305287cb2035c20d3f5af59f687f962080756374c16e48f", size = 1713353, upload-time = "2025-10-10T13:54:33.934Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/23/449562bd23fcfbd7d01006b39429972bfed5dfb8541355d06d2e17c16c27/mpi4py-4.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1bb1e3ad0b9047b0dbc7b4014160a7ab2a84f1627be665527c7445fc312f189b", size = 1496415, upload-time = "2025-10-10T13:54:35.927Z" },
+    { url = "https://files.pythonhosted.org/packages/51/33/9a5b9ae66cbb095b711f4ddae6d2d4b0f55202ac9e503fd588b101f04a22/mpi4py-4.1.1-cp314-cp314-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5f757e3089abf2c9db69fac1665fa99c52ed392fdf799159f25cba9ee3b64f5a", size = 1450750, upload-time = "2025-10-10T13:54:37.608Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/88/6acf948f19cb59c0e8843fed4ab4c471b7644e8a16c2d5d9c7ab6d73d573/mpi4py-4.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:807c6f1ed3adbc12952db52127e34cfbd6c48a05c3b3dd59deee2d2f09d78888", size = 1325773, upload-time = "2025-10-10T13:54:39.136Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/b4/3021e073772cd9e1062a810b7298e68ea40933fb91b1c1c0d07c968dce5c/mpi4py-4.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:2c85983d38d77e6302a242e32afd2a9a9b3adedd770e199a38e5b8957150e7ac", size = 1721603, upload-time = "2025-10-10T13:54:41.396Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/02/b6700c24fe28588a4e40adb23d02fe2aea82b33495fd6290235da5199383/mpi4py-4.1.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:729c4f625ad60e5cfb6c260608d249dc35a33cc16605faff01c6adbbd7e8ce0f", size = 1799551, upload-time = "2025-10-10T13:54:43.084Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/93/9c9870174183869bd5a50bbfe7bda91a52bf7ca2d0851de4009590e735a2/mpi4py-4.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3cca235d46009f54cb319c779c6ac53d41ce1eee3cf07f157995bc7739329b97", size = 1587583, upload-time = "2025-10-10T13:54:45.989Z" },
+    { url = "https://files.pythonhosted.org/packages/29/12/c46bec2311fc937ed3767312f9feb5f11bc70058c20bc53ae7369d759424/mpi4py-4.1.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2580fab891db492f32a6e02717e824f6fd5588be6560b08627c1e9322f7ccbfb", size = 1513437, upload-time = "2025-10-10T13:54:48.145Z" },
+    { url = "https://files.pythonhosted.org/packages/09/3e/e46629867204b22ce6804096e0b7d35bb5b473df1d12272021843af726c3/mpi4py-4.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6beec4841f9436d49ec9cabfd76a19df61c10b21ca14eddafa58fe7977802ee7", size = 1395082, upload-time = "2025-10-10T13:54:49.744Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/ca/7e27edf78cd8ba68aacafc836004cd092a978f0d5ffc8a3eac9e904a3e0e/mpi4py-4.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:b4b3813da9a7a1fc37ffb8dad314cb396313a40cd3fe150854ab29e999a9eb8c", size = 1771707, upload-time = "2025-10-10T13:54:51.756Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/63/b6a2863fb7dd5a9eccfdb055bf1124b999ff755d0187223b307161479b76/mpi4py-4.1.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:95bb98d946eb88c9ae4dc6c42d11b3af8ce6b91e644c288cc3f85ec7596ffcd3", size = 1480110, upload-time = "2025-10-10T13:55:11.381Z" },
+    { url = "https://files.pythonhosted.org/packages/de/18/358f0eb58fb3b79f65861ed682af9e735d86669663dfbce396e8673ed518/mpi4py-4.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:84e9eb2e609b0b94cd0e9a3e3b57d897f748fb0207c4f72e81e5a95aba033767", size = 1340704, upload-time = "2025-10-10T13:55:12.973Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/66/b342e330ac543d0147ebfab754f69854c4777ac9785cb5b7610e3cd0c29a/mpi4py-4.1.1-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:027b1a1ff9d57afed10af6b79041b95f85fd11b2af74e4c34ef4866ce81ecc24", size = 1380452, upload-time = "2025-10-10T13:55:14.582Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/61/bbf87de6f3a8a9c54e7a4b72878c9069646ca9cafac8217fa5493a54b068/mpi4py-4.1.1-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c1191856906967a48fdcc484b326c179747e68c186261d76480a75156bcc73bf", size = 1255980, upload-time = "2025-10-10T13:55:17.075Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/4b/227091dec11518e5545bd1ec91f52e06f64bdae697adc5fb33f9f20c04dc/mpi4py-4.1.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:189d49b0ae963f8f6f5dd8ed0f5f37923285c97bc725476990ec0556972bb4b2", size = 1452641, upload-time = "2025-10-10T13:55:18.562Z" },
 ]
 
 [[package]]

From 3a37c9c0dc093b14bf084aeeabb4238491e590cd Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 25 Mar 2026 14:29:13 +0100
Subject: [PATCH 65/68] Remove explicit GT4PY_BUILD_JOBS from distributed
 pipeline

---
 ci/distributed.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index d3ecfef697..b62447e15d 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -62,7 +62,6 @@ build_distributed:
     ICON4PY_TEST_DATA_PATH: "/icon4py/testdata"
     ICON4PY_ENABLE_GRID_DOWNLOAD: false
     ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false
-    GT4PY_BUILD_JOBS: 32
     GT4PY_BUILD_CACHE_LIFETIME: "persistent"
     PYTEST_ADDOPTS: "--durations=0"
     CSCS_ADDITIONAL_MOUNTS: '["/capstor/store/cscs/userlab/cwci02/icon4py/ci/testdata:$ICON4PY_TEST_DATA_PATH"]'

From 99811d07b67a359adff52fdc77514e2e6c728ad4 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 25 Mar 2026 15:31:21 +0100
Subject: [PATCH 66/68] Decrease distributed gpu timelimit

---
 ci/distributed.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index bce42b243d..f5e726ea99 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -107,7 +107,7 @@ build_distributed:
     - if: $COMPONENT == 'common' && ($BACKEND == 'dace_gpu' || $BACKEND == 'gtfn_gpu')
       variables:
         # TODO(msimberg): Decrease this when enabling dace_gpu above, if possible.
-        SLURM_TIMELIMIT: '03:00:00'
+        SLURM_TIMELIMIT: '01:30:00'
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

From d6dcc6c44cf56206ccda6daa4876b1e2f4c8555a Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 25 Mar 2026 15:57:45 +0100
Subject: [PATCH 67/68] Use normal partition for long distributed CI jobs

---
 ci/distributed.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index f5e726ea99..28faf710d8 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -108,6 +108,9 @@ build_distributed:
       variables:
         # TODO(msimberg): Decrease this when enabling dace_gpu above, if possible.
         SLURM_TIMELIMIT: '01:30:00'
+        # TODO(msimberg): Use shared partition when time limit can be set to at
+        # most an hour. The shared partition only accepts jobs maximum an hour long.
+        SLURM_PARTITION: "normal"
     - if: $COMPONENT == 'atmosphere/dycore'
       variables:
         SLURM_TIMELIMIT: '00:15:00'

From fabff2f9bfe673215ac793437f5627ba414215aa Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 25 Mar 2026 15:59:04 +0100
Subject: [PATCH 68/68] Remove gpus per task entry from distributed ci
 configuration

---
 ci/distributed.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/distributed.yml b/ci/distributed.yml
index 28faf710d8..50c7f85444 100644
--- a/ci/distributed.yml
+++ b/ci/distributed.yml
@@ -61,7 +61,6 @@ build_distributed:
     SLURM_PARTITION: "shared"
     SLURM_CPU_BIND: 'verbose'
     SLURM_NTASKS: 4
-    SLURM_GPUS_PER_TASK: 1
     ICON4PY_TEST_DATA_PATH: "/icon4py/testdata"
     ICON4PY_ENABLE_GRID_DOWNLOAD: false
     ICON4PY_ENABLE_TESTDATA_DOWNLOAD: false