From a1a69f777a14bb8584ba0eb53505cd5ee888bd5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 May 2026 13:34:08 +0300 Subject: [PATCH 1/3] metal : optimize concat kernel and fix set kernel threads (#23411) * metal : fix GGML_OP_SET kernel threads * tests : extend test_cpy to support different src/dst shapes Extend test_cpy to support different source and destination tensor shapes for CPY operations (reshaping), where the total number of elements must match. - Renamed ne -> ne_src, added ne_dst parameter (default: use src shape) - Added 50 new reshaping test cases covering 1D<->2D<->3D<->4D conversions - Tests exercise 1024 boundary, small shapes, and large dimensionality changes - Fixed dangling reference bug (storing & to temporary std::array) - Updated all existing test calls with permute/transpose args for compatibility Assisted-by: llama.cpp:local pi * metal : optimize concat kernel with row batching for small widths When ne0 < 256, batch multiple rows into a single threadgroup to improve occupancy. This avoids underutilizing the GPU when processing narrow tensors. - Dispatch nth = min(256, ne0) threads per group - Calculate nrptg (rows per threadgroup) to fill up to 256 threads - Update kernel index calculation to handle the row batching - Add boundary check for i1 >= ne1 Assisted-by: llama.cpp:local pi * tests : clean-up * tests : refactor CPY shape tests to use dimension permutations Replace 75 hardcoded test cases with a loop over permutations of {3, 5, 7, 32} (total elements: 3360). Each src permutation is tested against canonical sorted and reverse dst, skipping identical shapes. Covers F32, F16, and Q4_0 (when both src and dst ne0 == 32). Assisted-by: llama.cpp:local pi --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 19 +++- ggml/src/ggml-metal/ggml-metal.metal | 6 +- tests/test-backend-ops.cpp | 130 ++++++++++++++++++------- 3 files changed, 113 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 8506000b6c05..206af227a2c6 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); - const int nth = std::min(1024, ne0); + int nth = std::min(256, ne0); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + // when rows are small, we can batch them together in a single threadgroup + int nrptg = 1; + if (nth < 256) { + nrptg = std::min((256 + nth - 1) / nth, ne1); + if (nrptg * nth > 256) { + nrptg = 256 / nth; + } + } + + const int nw0 = (ne1 + nrptg - 1) / nrptg; + + ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1); return 1; } @@ -1786,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nk0 = ne10/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne11, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1797,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 4cf9dbea946c..e772664ba918 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -7486,7 +7486,11 @@ kernel void kernel_concat( const int i3 = tgpig.z; const int i2 = tgpig.y; - const int i1 = tgpig.x; + const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y; + + if (i1 >= args.ne1) { + return; + } int o[4] = {0, 0, 0, 0}; o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 43343b67935a..303f5a40daf7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2866,15 +2866,24 @@ struct test_set : public test_case { struct test_cpy : public test_case { const ggml_type type_src; const ggml_type type_dst; - const std::array ne; + const std::array ne_src; + const std::array ne_dst; const std::array permute_src; const std::array permute_dst; bool _src_use_permute; bool _dst_use_permute; bool _src_transpose; + bool _use_dst_shape; std::string vars() override { - return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose); + if (_use_dst_shape) { + return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose); + } + return VARS_TO_STR6(type_src, type_dst, ne_src, permute_src, permute_dst, _src_transpose); + } + + int64_t total_elements() const { + return ne_src[0] * ne_src[1] * ne_src[2] * ne_src[3]; } double max_nmse_err() override { @@ -2899,7 +2908,7 @@ struct test_cpy : public test_case { err_estimate /= 8.0f; } err_estimate *= err_estimate; - err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]); + err_estimate /= (150.0f*150.0f*0.25f)*float(total_elements()); return err_estimate; } return 1e-6; @@ -2910,17 +2919,19 @@ struct test_cpy : public test_case { } test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}, + std::array ne_src = {10, 10, 10, 1}, + std::array ne_dst = {-1, -1, -1, -1}, std::array permute_src = {0, 0, 0, 0}, std::array permute_dst = {0, 0, 0, 0}, bool transpose_src = false) - : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), + : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst), _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), - _src_transpose(transpose_src){} + _src_transpose(transpose_src), + _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data()); ggml_set_param(src); ggml_set_name(src, "src"); @@ -2934,7 +2945,8 @@ struct test_cpy : public test_case { ggml_set_name(src, "src_transposed"); } - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); + std::array dst_ne = _use_dst_shape ? ne_dst : std::array{src->ne[0], src->ne[1], src->ne[2], src->ne[3]}; + ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); ggml_set_name(dst, "dst"); if (_dst_use_permute) { @@ -8040,42 +8052,72 @@ static std::vector> make_test_cases_eval() { for (int k = 1; k < 4; ++k) { test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 3, 1, 2}, {0, 2, 1, 3})); } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : all_types) { for (ggml_type type_dst : {GGML_TYPE_F32}) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); // cpy not-contiguous } } test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + + // CPY - different src/dst shapes (reshaping via CPY) + // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360. + // Each src permutation is tested against canonical sorted and reverse dst (skip self). + { + std::array dims = {3, 5, 7, 32}; + std::sort(dims.begin(), dims.end()); + std::array canonical = dims; + std::array reversed = {32, 7, 5, 3}; + for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + std::array cur = dims; + do { + if (cur != canonical) { + test_cases.emplace_back(new test_cpy(type, type, cur, canonical)); + } + if (cur != reversed) { + test_cases.emplace_back(new test_cpy(type, type, cur, reversed)); + } + if (cur[0] == 32 && type == GGML_TYPE_F32) { + if (canonical[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, canonical)); + } + if (reversed[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, reversed)); + } + } + std::next_permutation(cur.begin(), cur.end()); + } while (cur != canonical); + } + } for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) { for (bool use_view_slice : { true, false }) { @@ -8830,9 +8872,24 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3)); + test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular test_cases.emplace_back(new test_pad_ext()); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false)); + test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); test_cases.emplace_back(new test_roll()); @@ -9132,22 +9189,21 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); From b65bb4baaeae712975e09a92e1d37d3842ea5da7 Mon Sep 17 00:00:00 2001 From: ScrewTSW Date: Thu, 21 May 2026 13:29:13 +0200 Subject: [PATCH 2/3] server: expose prompt token counts in /slots endpoint (#23454) Add n_prompt_tokens, n_prompt_tokens_processed, and n_prompt_tokens_cache to the /slots JSON response. These fields are already tracked internally but were not exposed, making it impossible for clients to monitor prompt evaluation progress during processing. --- tools/server/server-context.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 80d77b0c06b9..b939e3b75ebf 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -506,6 +506,9 @@ struct server_slot { if (ptask) { res["id_task"] = ptask->id; + res["n_prompt_tokens"] = (int32_t) prompt.tokens.size(); + res["n_prompt_tokens_processed"] = n_prompt_tokens_processed; + res["n_prompt_tokens_cache"] = n_prompt_tokens_cache; res["params"] = ptask->params.to_json(only_metrics); res["next_token"] = { { From 40d5358d3c730b81729ba81cd5c44ed596d02510 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 May 2026 14:41:50 +0300 Subject: [PATCH 3/3] tests : move save-load-state from examples to tests (#23336) * tests : move save-load-state from examples to tests - Move examples/save-load-state/ to tests/test-save-load-state.cpp - Remove subdirectory reference from examples/CMakeLists.txt - Add test to tests/CMakeLists.txt as a model test - Remove CODEOWNERS entry for removed example directory Assisted-by: llama.cpp:local pi * cont : update ci --- CODEOWNERS | 1 - ci/run.sh | 8 ++++---- examples/CMakeLists.txt | 1 - examples/save-load-state/CMakeLists.txt | 5 ----- tests/CMakeLists.txt | 4 ++++ .../save-load-state.cpp => tests/test-save-load-state.cpp | 0 6 files changed, 8 insertions(+), 11 deletions(-) delete mode 100644 examples/save-load-state/CMakeLists.txt rename examples/save-load-state/save-load-state.cpp => tests/test-save-load-state.cpp (100%) diff --git a/CODEOWNERS b/CODEOWNERS index f92fe98ce8c1..4b9d90177154 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -49,7 +49,6 @@ /examples/parallel/ @ggerganov /examples/passkey/ @ggerganov /examples/retrieval/ @ggerganov -/examples/save-load-state/ @ggerganov /examples/speculative-simple/ @ggerganov /examples/speculative/ @ggerganov /ggml/cmake/ @ggerganov diff --git a/ci/run.sh b/ci/run.sh index a8cbd3371d37..b096dc23b66f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b { (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a29dc707c3dc..39f802d250e1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -27,7 +27,6 @@ else() add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(retrieval) - add_subdirectory(save-load-state) add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt deleted file mode 100644 index 78024672e77d..000000000000 --- a/examples/save-load-state/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-save-load-state) -add_executable(${TARGET} save-load-state.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0fdbd39c94a5..33ae3b303cfc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -255,6 +255,10 @@ set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED llama_build_and_test(test-recurrent-state-rollback.cpp LABEL "model" ARGS -m "${MODEL_DEST}") set_tests_properties(test-recurrent-state-rollback PROPERTIES FIXTURES_REQUIRED test-download-model) +# Test state save/load functionality +llama_build_and_test(test-save-load-state.cpp LABEL "model" ARGS -m "${MODEL_DEST}") +set_tests_properties(test-save-load-state PROPERTIES FIXTURES_REQUIRED test-download-model) + if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading llama_build_and_test(test-barrier.cpp) diff --git a/examples/save-load-state/save-load-state.cpp b/tests/test-save-load-state.cpp similarity index 100% rename from examples/save-load-state/save-load-state.cpp rename to tests/test-save-load-state.cpp