Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,15 +710,15 @@ def _generate_nvfp4_tensors(self):
self._repack_nvfp4(name, weight, scale, scale2, input_scale)

# Flush any remaining experts (fallback if n_experts was unknown)
for bid, proj_type in expert_blocks.keys():
for bid, proj_type in list(expert_blocks.keys()):
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)

# Remove consumed tensors so get_tensors/modify_tensors won't see them
for name in consumed:
self.model_tensors.pop(name, None)

# Remove any remaining unused auxiliary tensors
for name in self.model_tensors.keys():
for name in list(self.model_tensors.keys()):
if name.endswith((".k_scale", ".v_scale")):
del self.model_tensors[name]

Expand Down
4 changes: 2 additions & 2 deletions examples/sycl/start-svr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,14 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
echo "Use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
GPUS_SETTING="-sm ${SPLIT_MODE}"
fi

echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000"
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000


2 changes: 1 addition & 1 deletion ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ extern "C" {
// device type
enum ggml_backend_dev_type type;
// device id
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
// for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:c1:00.0")
// if the id is unknown, this should be NULL
const char * device_id;
// device capabilities
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -965,7 +965,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
}
if (sched->debug > 1) {
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_desc(node), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
for (int j = 0; j < GGML_MAX_SRC; j++) {
Expand Down
3 changes: 3 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5434,6 +5434,9 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
char pci_bus_id[32] = {};
CUDA_CHECK(cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i));
dev_ctx->pci_bus_id = pci_bus_id;
for (char & c : dev_ctx->pci_bus_id) {
c = std::tolower(c);
}
dev_ctx->op_offload_min_batch_size = min_batch_size;

ggml_backend_dev_t dev = new ggml_backend_device {
Expand Down
2 changes: 2 additions & 0 deletions ggml/src/ggml-opencl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ set(GGML_OPENCL_KERNELS
mul_mv_id_q8_0_f32_flat
mul_mv_id_mxfp4_f32
mul_mv_id_mxfp4_f32_flat
gemm_moe_q4_0_f32_ns
gemv_moe_q4_0_f32_ns
gemm_moe_mxfp4_f32
gemv_moe_mxfp4_f32
gemm_moe_mxfp4_f32_ns
Expand Down
296 changes: 287 additions & 9 deletions ggml/src/ggml-opencl/ggml-opencl.cpp

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions ggml/src/ggml-opencl/kernels/cvt.cl
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,92 @@ kernel void kernel_restore_block_q4_0_noshuffle(
}
}

kernel void kernel_convert_block_q4_0_trans4_ns(
global struct block_q4_0 * src0,
__global uint * dst_q,
__global half * dst_d,
uint ne00,
uint ne01
) {
uint i00 = get_global_id(1);
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

uint ne00_blk = ne00 / QK4_0;
uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

global struct block_q4_0 * b = src0 + src_blk_offset;
dst_d[dst_blk_offset] = b->d;

// extract quantization and unshuffle
ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];

ushort8 post_block = (ushort8)(0);

uchar * pre_block_ptr = (uchar *)(&pre_block);
uchar * post_block_ptr = (uchar *)(&post_block);

for (int i = 0; i < QK4_0 / 4; ++i) {
uchar x0 = pre_block_ptr[2*i + 0];
uchar x1 = pre_block_ptr[2*i + 1];

post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
post_block_ptr[i + QK4_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
}

uint4 q_block = as_uint4(post_block);

uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
dst_q[offset] = q_block.x;
dst_q[offset + ne01] = q_block.y;
dst_q[offset + ne01 * 2] = q_block.z;
dst_q[offset + ne01 * 3] = q_block.w;
}

kernel void kernel_restore_block_q4_0_trans4_ns(
__global uint * src_q,
__global half * src_d,
__global struct block_q4_0 * dst0,
uint ne00,
uint ne01
) {
uint i00 = get_global_id(1);
uint i01 = get_global_id(0);
uint i02 = get_global_id(2);

uint ne00_blk = ne00 / QK4_0;
uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

__global struct block_q4_0 * b = dst0 + dst_blk_offset;
b->d = src_d[src_d_offset];

// collect transposed quantization parts for a block
uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
uint4 q_block;
q_block.x = src_q[src_q_offset];
q_block.y = src_q[src_q_offset + ne01];
q_block.z = src_q[src_q_offset + ne01 * 2];
q_block.w = src_q[src_q_offset + ne01 * 3];

ushort8 post_block = as_ushort8(q_block);
ushort8 pre_block = (ushort8)(0);

uchar * pre_block_ptr = (uchar *)(&pre_block);
uchar * post_block_ptr = (uchar *)(&post_block);

for (int i = 0; i < QK4_0 / 4; ++i) {
uchar x0 = post_block_ptr[i + 0];
uchar x1 = post_block_ptr[i + QK4_0 / 4];

pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
}

((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
}

//------------------------------------------------------------------------------
// kernel_convert_block_q4_1
// Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
Expand Down
Loading
Loading