LongLeCE · pull · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
@@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget xz-utils
 
 # Install SSL and Vulkan SDK dependencies
 RUN apt install -y libssl-dev curl \
-    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
+    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc spirv-headers
 
 # Build it
 WORKDIR /app

diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
@@ -141,61 +141,59 @@ jobs:
   #         amd-smi static
   #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
 
-  # TODO: sandbox Mac runners
-  #  ggml-ci-mac-metal:
-  #    runs-on: [self-hosted, macOS, ARM64]
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #
-  #      - name: Test
-  #        id: ggml-ci
-  #        run: |
-  #          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-  #
-  #  ggml-ci-mac-webgpu:
-  #    runs-on: [self-hosted, macOS, ARM64]
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #
-  #      - name: Dawn Dependency
-  #        id: dawn-depends
-  #        run: |
-  #          DAWN_VERSION="v2.0.0"
-  #          DAWN_OWNER="reeselevine"
-  #          DAWN_REPO="dawn"
-  #          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
-  #          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
-  #          curl -L -o artifact.zip \
-  #            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
-  #          mkdir dawn
-  #          unzip artifact.zip
-  #          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
-  #
-  #      - name: Test
-  #        id: ggml-ci
-  #        run: |
-  #          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-  #            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-  #
-  #  ggml-ci-mac-vulkan:
-  #    runs-on: [self-hosted, macOS, ARM64]
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #
-  #      - name: Test
-  #        id: ggml-ci
-  #        run: |
-  #          vulkaninfo --summary
-  #          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  ggml-ci-mac-metal:
+    runs-on: [self-hosted, macOS, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-mac-webgpu:
+    runs-on: [self-hosted, macOS, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-mac-vulkan:
+    runs-on: [self-hosted, macOS, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-linux-intel-vulkan:
     runs-on: [self-hosted, Linux, Intel]

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -318,7 +318,7 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
+          sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
           echo "CC=gcc-14" >> "$GITHUB_ENV"
           echo "CXX=g++-14" >> "$GITHUB_ENV"
 

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -202,7 +202,7 @@ jobs:
             sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
           else
             sudo apt-get update -y
-            sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
+            sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
             echo "CC=gcc-14" >> "$GITHUB_ENV"
             echo "CXX=g++-14" >> "$GITHUB_ENV"
           fi

diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
@@ -84,41 +84,42 @@ jobs:
           export ${{ matrix.extra_args }}
           pytest -v -x -m "not slow"
 
-  server-cuda:
-    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
-
-    name: server-cuda (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["GPUx1"]
-        include:
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name:    "GPUx1, backend-sampling"
-      fail-fast: false
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
+  # TODO: provision CUDA runner
+  #  server-cuda:
+  #    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
+  #
+  #    name: server-cuda (${{ matrix.wf_name }})
+  #    strategy:
+  #      matrix:
+  #        build_type: [Release]
+  #        wf_name: ["GPUx1"]
+  #        include:
+  #          - build_type: Release
+  #            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+  #            wf_name:    "GPUx1, backend-sampling"
+  #      fail-fast: false
+  #
+  #    steps:
+  #      - name: Clone
+  #        id: checkout
+  #        uses: actions/checkout@v6
+  #        with:
+  #          fetch-depth: 0
+  #          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+  #
+  #      - name: Build
+  #        id: cmake_build
+  #        run: |
+  #          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
+  #          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
+  #
+  #      - name: Tests
+  #        id: server_integration_tests
+  #        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+  #        run: |
+  #          cd tools/server/tests
+  #          python3 -m venv venv
+  #          source venv/bin/activate
+  #          pip install -r requirements.txt
+  #          export ${{ matrix.extra_args }}
+  #          pytest -v -x -m "not slow"
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -287,8 +287,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         }
     }
 
-    // reasoning budget sampler
-    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
+    // reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression)
+    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) {
         rbudget = common_reasoning_budget_init(
             vocab,
             params.reasoning_budget_start,

diff --git a/docs/build.md b/docs/build.md
@@ -456,7 +456,8 @@ pacman -S git \
     mingw-w64-ucrt-x86_64-gcc \
     mingw-w64-ucrt-x86_64-cmake \
     mingw-w64-ucrt-x86_64-vulkan-devel \
-    mingw-w64-ucrt-x86_64-shaderc
+    mingw-w64-ucrt-x86_64-shaderc \
+    mingw-w64-ucrt-x86_64-spirv-headers
 ```
 
 Switch into the `llama.cpp` directory and build using CMake.
@@ -490,9 +491,11 @@ First, follow the official LunarG instructions for the installation and setup of
 
 On Debian / Ubuntu, you can install the required dependencies using:
 ```sh
-sudo apt-get install libvulkan-dev glslc
+sudo apt-get install libvulkan-dev glslc spirv-headers
 ```
 
+SPIRV-Headers (`spirv/unified1/spirv.hpp`) are required for the Vulkan backend and are **not** always pulled in by the Vulkan loader dev package alone. Other distros use names such as `spirv-headers` (Ubuntu / Debian / Arch), or `spirv-headers-devel` (Fedora / openSUSE). On Windows, the LunarG Vulkan SDK’s `Include` directory already contains these headers.
+
 #### Common steps
 
 Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
@@ -1,4 +1,11 @@
 cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
+
+# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
+# MSVC is not a valid assembler for the ASM language.
+# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
+if (POLICY CMP0194)
+    cmake_policy(SET CMP0194 NEW)
+endif()
 project("ggml" C CXX ASM)
 
 ### GGML Version

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
@@ -348,6 +348,53 @@ extern "C" {
     // Set a callback to be called for each resulting node during graph compute
     GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
+    //
+    // Meta backend
+    //
+
+#define GGML_BACKEND_META_MAX_DEVICES 16
+
+    enum ggml_backend_meta_split_axis {
+        // tensor split by tensor dimensions:
+        GGML_BACKEND_SPLIT_AXIS_0 = 0,
+        GGML_BACKEND_SPLIT_AXIS_1 = 1,
+        GGML_BACKEND_SPLIT_AXIS_2 = 2,
+        GGML_BACKEND_SPLIT_AXIS_3 = 3,
+
+        GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
+        GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
+
+        // for internal bookkeeping only:
+        GGML_BACKEND_SPLIT_AXIS_NONE    = 98,
+        GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
+    };
+    GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
+
+    struct ggml_backend_meta_split_state {
+        enum ggml_backend_meta_split_axis axis;
+
+        // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
+        //   - each device has a slice of the tensor along the split axis
+        //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
+        //   - some tensors have an inhomogenenous data layout along the split axis,
+        //     those tensors are divided into segments which are each individually split across devices
+        //   - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
+        //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
+        //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
+        //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
+        int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
+        uint32_t n_segments;
+    };
+
+    // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
+    typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
+
+    // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
+    // TODO: this looks a bit strange - a backend API creates a device. I think we should try
+    //       express this as a backend registry functionality instead
+    GGML_API ggml_backend_dev_t ggml_backend_meta_device(
+        ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+
     //
     // Utils
     //

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
@@ -2,6 +2,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml.h"
 #include "ggml-impl.h"
+
 #include <assert.h>
 #include <limits.h>
 #include <stdarg.h>

diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
@@ -5,9 +5,6 @@
 #include "ggml-alloc.h"
 #include "ggml-cpp.h"
 
-// TODO: tmp
-#include "ggml-ext.h"
-
 #include <algorithm>
 #include <cassert>
 #include <cmath>