Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4693,7 +4693,7 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
GGML_UNUSED(backend_ctx);
int ne01 = tensor->ne[1];
return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 32 == 0);
}

inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
Expand Down Expand Up @@ -14297,7 +14297,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down Expand Up @@ -14513,7 +14513,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down Expand Up @@ -14689,7 +14689,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down Expand Up @@ -14865,7 +14865,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down Expand Up @@ -15118,7 +15118,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down Expand Up @@ -15291,7 +15291,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down Expand Up @@ -15469,7 +15469,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down Expand Up @@ -15644,7 +15644,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
CL_CHECK(status);

// set thread grid
global_size[0] = static_cast<size_t>(ne01);
global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
global_size[1] = 4;
global_size[2] = static_cast<size_t>(ne20);
local_size[1] = 4;
Expand Down
64 changes: 64 additions & 0 deletions ggml/src/ggml-opencl/kernels/cvt.cl
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ kernel void kernel_convert_block_q4_0_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK4_0;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -263,6 +267,10 @@ kernel void kernel_restore_block_q4_0_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK4_0;
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -401,6 +409,10 @@ kernel void kernel_convert_block_q4_1_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK4_1;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -446,6 +458,10 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK4_1;
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -491,6 +507,10 @@ kernel void kernel_convert_block_q5_0_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK5_0;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -536,6 +556,10 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK5_0;
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -583,6 +607,10 @@ kernel void kernel_convert_block_q5_1_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK5_1;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -630,6 +658,10 @@ kernel void kernel_restore_block_q5_1_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK5_1;
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -679,6 +711,10 @@ kernel void kernel_convert_block_q4_k_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_K;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -732,6 +768,10 @@ kernel void kernel_restore_block_q4_k_trans4_ns(
uint i01 = get_global_id(0); // row index
uint i02 = get_global_id(2); // batch index

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_K;

uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -784,6 +824,10 @@ kernel void kernel_convert_block_q5_k_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_K;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -850,6 +894,10 @@ kernel void kernel_restore_block_q5_k_trans4_ns(
uint i01 = get_global_id(0); // row index
uint i02 = get_global_id(2); // batch index

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_K;

uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -916,6 +964,10 @@ kernel void kernel_convert_block_q6_k_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_K;

uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -993,6 +1045,10 @@ kernel void kernel_restore_block_q6_k_trans4_ns(
uint i01 = get_global_id(0); // row index
uint i02 = get_global_id(2); // batch index

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_K;

uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -1147,6 +1203,10 @@ kernel void kernel_convert_block_mxfp4_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_MXFP4;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down Expand Up @@ -1190,6 +1250,10 @@ kernel void kernel_restore_block_mxfp4_trans4_ns(
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

if (i01 >= ne01) {
return;
}

uint ne00_blk = ne00 / QK_MXFP4;
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
uint block_id_n = get_global_id(2); // n_tile

// Boundary check
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
if (block_id_n >= total_tiles[0]) {
return;
}

Expand Down Expand Up @@ -248,6 +248,10 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
}

if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
return;
}

// Load poster router and share in LM
__local uint out_idx[TILESIZE_N];

Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
uint block_id_n = get_global_id(2); // n_tile

// Boundary check
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
if (block_id_n >= total_tiles[0]) {
return;
}

Expand Down Expand Up @@ -198,6 +198,10 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
}

if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
return;
}

// Load poster router and share in LM
__local uint out_idx[TILESIZE_N];

Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
uint block_id_n = get_global_id(2); // n_tile

// Boundary check
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
if (block_id_n >= total_tiles[0]) {
return;
}

Expand Down Expand Up @@ -200,6 +200,10 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
}

if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
return;
}

// Load poster router and share in LM
__local uint out_idx[TILESIZE_N];

Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
uint block_id_n = get_global_id(2); // n_tile

// Boundary check
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
if (block_id_n >= total_tiles[0]) {
return;
}

Expand Down Expand Up @@ -225,6 +225,10 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
}

if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
return;
}

// Load post router and share in LM
__local uint out_idx[TILESIZE_N];

Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
uint block_id_n = get_global_id(2); // n_tile

// Boundary check
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
if (block_id_n >= total_tiles[0]) {
return;
}

Expand Down Expand Up @@ -202,6 +202,10 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
}

if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
return;
}

// Load poster router and share in LM
__local uint out_idx[TILESIZE_N];

Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
uint block_id_n = get_global_id(2); // n_tile

// Boundary check
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
if (block_id_n >= total_tiles[0]) {
return;
}

Expand Down Expand Up @@ -204,6 +204,10 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
}

if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
return;
}

// Load poster router and share in LM
__local uint out_idx[TILESIZE_N];

Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
uint block_id_n = get_global_id(2); // n_tile

// Boundary check
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
if (block_id_n >= total_tiles[0]) {
return;
}

Expand Down Expand Up @@ -230,6 +230,10 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
}

if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
return;
}

// Load post router and share in LM
__local uint out_idx[TILESIZE_N];

Expand Down
Loading
Loading