From 549b9d84330c327e6791fa812a7d60c0cf63572e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 24 May 2026 18:20:10 +0300
Subject: [PATCH] ci : update build-self-hosted.yml (#23616)

---
 .github/workflows/build-self-hosted.yml  |  86 ++++++++++++++++---
 .github/workflows/build.yml              | 102 ++++++++++++-----------
 .github/workflows/server-self-hosted.yml |  77 +++++++++--------
 ci/run.sh                                |   2 +-
 4 files changed, 167 insertions(+), 100 deletions(-)

diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
index 2851c45601f..c247222eb4b 100644
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -57,7 +57,7 @@ env:
 jobs:
   determine-tag:
     name: Determine tag name
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     outputs:
       tag_name: ${{ steps.tag.outputs.name }}
     steps:
@@ -86,7 +86,7 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-vulkan-cm:
     needs: determine-tag
@@ -103,7 +103,7 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-vulkan-cm2:
     needs: determine-tag
@@ -120,10 +120,11 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-webgpu:
-    runs-on: [self-hosted, Linux, NVIDIA]
+    needs: determine-tag
+    runs-on: [self-hosted, Linux, NVIDIA, X64]
 
     steps:
       - name: Clone
@@ -149,10 +150,11 @@ jobs:
           GG_BUILD_WEBGPU=1 \
           GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
           GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMX-compatible machine
   #ggml-ci-cpu-amx:
+  #  needs: determine-tag
   #  runs-on: [self-hosted, Linux, CPU, AMX]
 
   #  steps:
@@ -163,10 +165,11 @@ jobs:
   #    - name: Test
   #      id: ggml-ci
   #      run: |
-  #        bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #        bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMD GPU machine
   # ggml-ci-amd-vulkan:
+  #   needs: determine-tag
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -178,10 +181,11 @@ jobs:
   #       id: ggml-ci
   #       run: |
   #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMD GPU machine
   # ggml-ci-amd-rocm:
+  #   needs: determine-tag
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -193,7 +197,7 @@ jobs:
   #       id: ggml-ci
   #       run: |
   #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-mac-metal:
     needs: determine-tag
@@ -337,4 +341,66 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           source ./openvino_toolkit/setupvars.sh
-          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-arm64-cpu-low-perf:  # ARM64 CPU run of ci/run.sh with GG_BUILD_LOW_PERF=1 on a self-hosted runner
+    needs: determine-tag  # ordering only: no step in this job reads the determine-tag outputs
+    runs-on: [self-hosted, Linux, ARM64]  # NOTE(review): runners carrying extra labels (e.g. NVIDIA) also match - confirm intended
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci  # LLAMA_ARG_THREADS is taken from nproc; both ci/run.sh path arguments live under the runner user's home
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-arm64-cpu-high-perf:  # ARM64 CPU run of ci/run.sh with GG_BUILD_HIGH_PERF=1 on a self-hosted runner
+    needs: determine-tag  # ordering only: no step in this job reads the determine-tag outputs
+    runs-on: [self-hosted, Linux, ARM64]  # NOTE(review): same label set as the low-perf job, so both can land on the same runner - confirm intended
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci  # SVE and BF16 are switched off via GG_BUILD_NO_SVE / GG_BUILD_NO_BF16; GG_BUILD_EXTRA_TESTS_0 is enabled
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+# TODO: not sure how to detect ARM flags on DGX Spark. currently get this warning during cmake:
+#         CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message):
+#           ARM -march/-mcpu not found, -mcpu=native will be used
+#
+#       if we resolve this, we should be able to offload these jobs to the self-hosted runners
+#
+#  ggml-ci-arm64-cpu-high-perf-sve:
+#    needs: determine-tag
+#    runs-on: [self-hosted, Linux, NVIDIA, ARM64]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+#
+#  ggml-ci-arm64-cpu-kleidiai:
+#    needs: determine-tag
+#    runs-on: [self-hosted, Linux, NVIDIA, ARM64]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 65fa24f4468..47b377ff72b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -931,31 +931,32 @@ jobs:
         run: |
           LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
-  ggml-ci-arm64-cpu-low-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-arm64-cpu-low-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+# note: moved to build-self-hosted.yml - can remove from here when everything is stable
+#  ggml-ci-arm64-cpu-low-perf:
+#    runs-on: ubuntu-22.04-arm
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: ggml-ci-arm64-cpu-low-perf
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Dependencies
+#        id: depends
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-x64-cpu-high-perf:
     runs-on: ubuntu-22.04
@@ -983,31 +984,32 @@ jobs:
         run: |
           LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
-  ggml-ci-arm64-cpu-high-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-arm64-cpu-high-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+# note: moved to build-self-hosted.yml - can remove from here when everything is stable
+#  ggml-ci-arm64-cpu-high-perf:
+#    runs-on: ubuntu-22.04-arm
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: ggml-ci-arm64-cpu-high-perf
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Dependencies
+#        id: depends
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-arm64-cpu-high-perf-sve:
     runs-on: ubuntu-22.04-arm
diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
index 857c72a4619..91e0653943c 100644
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -91,45 +91,44 @@ jobs:
           export ${{ matrix.extra_args }}
           pytest -v -x -m "not slow"
 
-  # TODO: provision CUDA runner
-  #  server-cuda:
-  #    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
-  #
-  #    name: server-cuda (${{ matrix.wf_name }})
-  #    strategy:
-  #      matrix:
-  #        build_type: [Release]
-  #        wf_name: ["GPUx1"]
-  #        include:
-  #          - build_type: Release
-  #            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-  #            wf_name:    "GPUx1, backend-sampling"
-  #      fail-fast: false
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #        with:
-  #          fetch-depth: 0
-  #          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-  #
-  #      - name: Build
-  #        id: cmake_build
-  #        run: |
-  #          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-  #          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-  #
-  #      - name: Tests
-  #        id: server_integration_tests
-  #        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-  #        run: |
-  #          cd tools/server/tests
-  #          python3 -m venv venv
-  #          source venv/bin/activate
-  #          pip install -r requirements.txt
-  #          export ${{ matrix.extra_args }}
-  #          pytest -v -x -m "not slow"
+  server-cuda:  # builds llama-server with CUDA enabled and runs the non-slow server pytest suite
+    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
+
+    name: server-cuda (${{ matrix.wf_name }})
+    strategy:
+      matrix:
+        build_type: [Release]
+        wf_name: ["GPUx1"]
+        include:  # extra entry that sets extra_args; the base entry above leaves extra_args undefined
+          - build_type: Release
+            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+            wf_name:    "GPUx1, backend-sampling"
+      fail-fast: false  # keep the other matrix entries running when one fails
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0  # fetch full history
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Build
+        id: cmake_build  # build type is set at configure time: --config is ignored by single-config generators (Makefiles/Ninja)
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}  # no matrix entry above defines disabled_on_pr
+        run: |
+          cd tools/server/tests
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          export ${{ matrix.extra_args }}
+          pytest -v -x -m "not slow"
 
   server-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x
diff --git a/ci/run.sh b/ci/run.sh
index b096dc23b66..4acf4375267 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -238,7 +238,7 @@ function gg_run_ctest_debug {
     (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
 
     set +e
 }