Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devops/vulkan.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget xz-utils

# Install SSL and Vulkan SDK dependencies
RUN apt install -y libssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc spirv-headers

# Build it
WORKDIR /app
Expand Down
108 changes: 53 additions & 55 deletions .github/workflows/build-self-hosted.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,61 +141,59 @@ jobs:
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

# TODO: sandbox Mac runners
# ggml-ci-mac-metal:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-webgpu:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Dawn Dependency
# id: dawn-depends
# run: |
# DAWN_VERSION="v2.0.0"
# DAWN_OWNER="reeselevine"
# DAWN_REPO="dawn"
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# curl -L -o artifact.zip \
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# mkdir dawn
# unzip artifact.zip
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-vulkan:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v6

- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

ggml-ci-mac-webgpu:
runs-on: [self-hosted, macOS, ARM64]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v6

- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1

- name: Test
id: ggml-ci
run: |
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v6

- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

ggml-ci-linux-intel-vulkan:
runs-on: [self-hosted, Linux, Intel]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ jobs:
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
else
sudo apt-get update -y
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
fi
Expand Down
77 changes: 39 additions & 38 deletions .github/workflows/server-self-hosted.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,41 +84,42 @@ jobs:
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"

server-cuda:
runs-on: [self-hosted, llama-server, Linux, NVIDIA]

name: server-cuda (${{ matrix.wf_name }})
strategy:
matrix:
build_type: [Release]
wf_name: ["GPUx1"]
include:
- build_type: Release
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx1, backend-sampling"
fail-fast: false

steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

- name: Build
id: cmake_build
run: |
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server

- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
run: |
cd tools/server/tests
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"
# TODO: provision CUDA runner
# server-cuda:
# runs-on: [self-hosted, llama-server, Linux, NVIDIA]
#
# name: server-cuda (${{ matrix.wf_name }})
# strategy:
# matrix:
# build_type: [Release]
# wf_name: ["GPUx1"]
# include:
# - build_type: Release
# extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
# wf_name: "GPUx1, backend-sampling"
# fail-fast: false
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# with:
# fetch-depth: 0
# ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
#
# - name: Build
# id: cmake_build
# run: |
# cmake -B build -DGGML_SCHED_NO_REALLOC=ON
# cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
#
# - name: Tests
# id: server_integration_tests
# if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
# run: |
# cd tools/server/tests
# python3 -m venv venv
# source venv/bin/activate
# pip install -r requirements.txt
# export ${{ matrix.extra_args }}
# pytest -v -x -m "not slow"
4 changes: 2 additions & 2 deletions common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}

// reasoning budget sampler
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
// reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression)
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) {
rbudget = common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,
Expand Down
7 changes: 5 additions & 2 deletions docs/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,8 @@ pacman -S git \
mingw-w64-ucrt-x86_64-gcc \
mingw-w64-ucrt-x86_64-cmake \
mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc
mingw-w64-ucrt-x86_64-shaderc \
mingw-w64-ucrt-x86_64-spirv-headers
```

Switch into the `llama.cpp` directory and build using CMake.
Expand Down Expand Up @@ -490,9 +491,11 @@ First, follow the official LunarG instructions for the installation and setup of

On Debian / Ubuntu, you can install the required dependencies using:
```sh
sudo apt-get install libvulkan-dev glslc
sudo apt-get install libvulkan-dev glslc spirv-headers
```

SPIRV-Headers (`spirv/unified1/spirv.hpp`) are required for the Vulkan backend and are **not** always pulled in by the Vulkan loader dev package alone. Other distros use names such as `spirv-headers` (Ubuntu / Debian / Arch), or `spirv-headers-devel` (Fedora / openSUSE). On Windows, the LunarG Vulkan SDK’s `Include` directory already contains these headers.

#### Common steps

Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
Expand Down
7 changes: 7 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.

# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
# MSVC is not a valid assembler for the ASM language.
# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
if (POLICY CMP0194)
cmake_policy(SET CMP0194 NEW)
endif()
project("ggml" C CXX ASM)

### GGML Version
Expand Down
47 changes: 47 additions & 0 deletions ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,53 @@ extern "C" {
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

//
// Meta backend
//

#define GGML_BACKEND_META_MAX_DEVICES 16

enum ggml_backend_meta_split_axis {
// tensor split by tensor dimensions:
GGML_BACKEND_SPLIT_AXIS_0 = 0,
GGML_BACKEND_SPLIT_AXIS_1 = 1,
GGML_BACKEND_SPLIT_AXIS_2 = 2,
GGML_BACKEND_SPLIT_AXIS_3 = 3,

GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum

// for internal bookkeeping only:
GGML_BACKEND_SPLIT_AXIS_NONE = 98,
GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
};
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);

struct ggml_backend_meta_split_state {
enum ggml_backend_meta_split_axis axis;

// for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
// - each device has a slice of the tensor along the split axis
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
// - some tensors have an inhomogenenous data layout along the split axis,
// those tensors are divided into segments which are each individually split across devices
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
uint32_t n_segments;
};

// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);

// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
// express this as a backend registry functionality instead
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);

//
// Utils
//
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"

#include <assert.h>
#include <limits.h>
#include <stdarg.h>
Expand Down
3 changes: 0 additions & 3 deletions ggml/src/ggml-backend-meta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@
#include "ggml-alloc.h"
#include "ggml-cpp.h"

// TODO: tmp
#include "ggml-ext.h"

#include <algorithm>
#include <cassert>
#include <cmath>
Expand Down
Loading
Loading