
Commit 6015feb

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 4b610c7 commit 6015feb

1 file changed: +58 -59 lines changed
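This page does not say which hooks ran, but judging from the diff the fixes are purely stylistic, consistent with Black-style normalization: spaces around `**` are removed for simple operands, trailing comments get two spaces before the `#`, backslash line continuations become implicit continuation inside parentheses, and short wrapped calls are collapsed onto one line. A runnable before/after illustration built around one line from the diff (the dummy arrays are mine, added only to make the snippet executable):

```python
import numpy as np

weights = np.ones((2, 32), dtype=np.float32)  # dummy stand-ins so the
diff = np.ones((2, 32), dtype=np.float32)     # two lines below can run

# Old style (removed lines): spaced power operator, one space before comment.
best_mad = np.sum(weights * diff ** 2, axis=1, keepdims=True) # (nb, 1)

# New style (added lines): hugged power operator, two spaces before comment.
best_mad = np.sum(weights * diff**2, axis=1, keepdims=True)  # (nb, 1)
```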

neural_compressor/adaptor/ox_utils/weight_only.py

@@ -246,6 +246,7 @@ def quant_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ra
 
     return q_weight, scale, zero_point
 
+
 def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
     """Quantize tensor per group based on k quant.
     Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
@@ -260,44 +261,44 @@ def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
         scale: scale
         zero_point: zero point
     """
-    data = np.reshape(data, (-1, group_size)).astype(np.float32) # (nb, group_size)
+    data = np.reshape(data, (-1, group_size)).astype(np.float32)  # (nb, group_size)
     maxq = 2**num_bits - 1
     minq = 0
-    sum_x2 = np.sum(data**2, axis=1, keepdims=True) # (nb, 1)
-    av_x = np.sqrt(sum_x2 / group_size) # (nb, 1)
-    weights = np.add(av_x, np.abs(data)) # (nb, group_size)
-    rmin = np.min(data, axis=1, keepdims=True) # (nb, 1)
-    rmax = np.max(data, axis=1, keepdims=True) # (nb, 1)
-    sum_w = np.sum(weights, axis=1, keepdims=True) # (nb, 1)
-    sum_x = np.sum(weights * data, axis=1, keepdims=True) # (nb, group_size)
-    iscale = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
+    sum_x2 = np.sum(data**2, axis=1, keepdims=True)  # (nb, 1)
+    av_x = np.sqrt(sum_x2 / group_size)  # (nb, 1)
+    weights = np.add(av_x, np.abs(data))  # (nb, group_size)
+    rmin = np.min(data, axis=1, keepdims=True)  # (nb, 1)
+    rmax = np.max(data, axis=1, keepdims=True)  # (nb, 1)
+    sum_w = np.sum(weights, axis=1, keepdims=True)  # (nb, 1)
+    sum_x = np.sum(weights * data, axis=1, keepdims=True)  # (nb, group_size)
+    iscale = np.ones(rmax.shape, dtype=data.dtype)  # (nb, 1)
     mask = rmin != rmax
     iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
     scale = 1 / iscale
-    quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
-    diff = scale * quant_data + rmin - data # (nb, group_size)
-    best_mad = np.sum(weights * diff ** 2, axis=1, keepdims=True) # (nb, 1)
+    quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq)  # (nb, group_size)
+    diff = scale * quant_data + rmin - data  # (nb, group_size)
+    best_mad = np.sum(weights * diff**2, axis=1, keepdims=True)  # (nb, 1)
     nstep = 20
     rdelta = 0.1
     # nstep * rdelta = -2 * rrmin, maxq - minq = 2**num_bits - 1
     rrmin = -1
     for is_ in range(nstep):
-        iscale_new = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
+        iscale_new = np.ones(rmax.shape, dtype=data.dtype)  # (nb, 1)
         factor = np.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
         mask = rmin != rmax
         iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
-        quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
+        quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq)  # (nb, group_size)
         mul_weights_quant_data_new = weights * quant_data_new
-        sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
-        sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
-        sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
-        D = np.subtract(sum_w * sum_l2, sum_l ** 2) # (nb, 1)
+        sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True)  # (nb, 1)
+        sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True)  # (nb, 1)
+        sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True)  # (nb, 1)
+        D = np.subtract(sum_w * sum_l2, sum_l**2)  # (nb, 1)
 
-        this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
-        this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
+        this_scale = (sum_w * sum_xl - sum_x * sum_l) / D  # (nb, 1)
+        this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D  # (nb, 1)
 
-        diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
-        mad = np.sum(weights * diff ** 2, axis=1, keepdims=True) # (nb, 1)
+        diff = this_scale * quant_data_new + this_min - data  # (nb, group_size)
+        mad = np.sum(weights * diff**2, axis=1, keepdims=True)  # (nb, 1)
 
         mad_1 = np.array(mad)
         best_mad_1 = np.array(best_mad)
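To unpack the loop above: each of the 20 steps proposes a new inverse scale via `factor`, requantizes, and then, for those fixed integer codes, solves a 2x2 weighted least-squares problem in closed form. Minimizing sum(w * (s*l + m - x)**2) over scale s and offset m via Cramer's rule on the normal equations gives exactly the `this_scale` and `this_min` expressions. A standalone sketch of that single step on synthetic data (variable names mirror the diff; this is illustrative, not the library's API):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 32)).astype(np.float32)  # (nb, group_size) data
w = np.abs(x) + np.sqrt(np.mean(x**2, axis=1, keepdims=True))  # k-quant weights
# Fixed candidate integer codes in [0, 15] (4-bit), as produced in the loop body.
rmin, rmax = x.min(axis=1, keepdims=True), x.max(axis=1, keepdims=True)
l = np.clip(np.round(15 * (x - rmin) / (rmax - rmin)), 0, 15)

# Weighted normal equations for min_{s,m} sum(w * (s*l + m - x)**2):
sum_w = np.sum(w, axis=1, keepdims=True)
sum_x = np.sum(w * x, axis=1, keepdims=True)
sum_l = np.sum(w * l, axis=1, keepdims=True)
sum_l2 = np.sum(w * l**2, axis=1, keepdims=True)
sum_xl = np.sum(w * l * x, axis=1, keepdims=True)
D = sum_w * sum_l2 - sum_l**2  # determinant of the 2x2 system

this_scale = (sum_w * sum_xl - sum_x * sum_l) / D  # same formula as the diff
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D   # same formula as the diff
```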
@@ -307,7 +308,7 @@ def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
         scale[idx_to_replace] = this_scale[idx_to_replace]
         rmin[idx_to_replace] = this_min[idx_to_replace]
 
-    zero_point = np.clip((( - rmin) / scale).round(), 0, maxq).astype("uint8")
+    zero_point = np.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
     scale = scale.astype(np.float64)
     q_weight = np.empty_like(data, dtype=scale.dtype)
     np.divide(data, scale, out=q_weight)
@@ -317,6 +318,7 @@ def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
 
     return q_weight, scale, zero_point
 
+
 def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
     """Quantize tensor per group based on k quant.
     Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
@@ -334,46 +336,47 @@ def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
     try:
         import cupy as cp
         import torch
+
         if torch.cuda.is_available():
             data = cp.asarray(data)
-            data = data.reshape((-1, group_size)).astype(np.float32) # (nb, group_size)
+            data = data.reshape((-1, group_size)).astype(np.float32)  # (nb, group_size)
             nb = data.shape[0]
             maxq = 2**num_bits - 1
             minq = 0
-            sum_x2 = np.sum(data**2, axis=1, keepdims=True) # (nb, 1)
-            av_x = np.sqrt(sum_x2 / group_size) # (nb, 1)
-            weights = np.add(av_x, np.abs(data)) # (nb, group_size)
-            rmin = np.min(data, axis=1, keepdims=True) # (nb, 1)
-            rmax = np.max(data, axis=1, keepdims=True) # (nb, 1)
-            sum_w = np.sum(weights, axis=1, keepdims=True) # (nb, 1)
-            sum_x = np.sum(weights * data, axis=1, keepdims=True) # (nb, group_size)
-            iscale = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
+            sum_x2 = np.sum(data**2, axis=1, keepdims=True)  # (nb, 1)
+            av_x = np.sqrt(sum_x2 / group_size)  # (nb, 1)
+            weights = np.add(av_x, np.abs(data))  # (nb, group_size)
+            rmin = np.min(data, axis=1, keepdims=True)  # (nb, 1)
+            rmax = np.max(data, axis=1, keepdims=True)  # (nb, 1)
+            sum_w = np.sum(weights, axis=1, keepdims=True)  # (nb, 1)
+            sum_x = np.sum(weights * data, axis=1, keepdims=True)  # (nb, group_size)
+            iscale = cp.ones(rmax.shape, dtype=data.dtype)  # (nb, 1)
             mask = rmin != rmax
             iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
             scale = 1 / iscale
-            quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
-            diff = scale * quant_data + rmin - data # (nb, group_size)
-            best_mad = np.sum(weights * diff ** 2, axis=1, keepdims=True) # (nb, 1)
+            quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq)  # (nb, group_size)
+            diff = scale * quant_data + rmin - data  # (nb, group_size)
+            best_mad = np.sum(weights * diff**2, axis=1, keepdims=True)  # (nb, 1)
             nstep = 20
             rdelta = 0.1
             rrmin = -1
             for is_ in range(nstep):
-                iscale_new = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
+                iscale_new = cp.ones(rmax.shape, dtype=data.dtype)  # (nb, 1)
                 factor = cp.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
                 mask = rmin != rmax
                 iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
-                quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
+                quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq)  # (nb, group_size)
                 mul_weights_quant_data_new = weights * quant_data_new
-                sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
-                sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
-                sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
-                D = np.subtract(sum_w * sum_l2, sum_l ** 2) # (nb, 1)
+                sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True)  # (nb, 1)
+                sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True)  # (nb, 1)
+                sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True)  # (nb, 1)
+                D = np.subtract(sum_w * sum_l2, sum_l**2)  # (nb, 1)
 
-                this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
-                this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
+                this_scale = (sum_w * sum_xl - sum_x * sum_l) / D  # (nb, 1)
+                this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D  # (nb, 1)
 
-                diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
-                mad = np.sum(weights * diff ** 2, axis=1, keepdims=True) # (nb, 1)
+                diff = this_scale * quant_data_new + this_min - data  # (nb, group_size)
+                mad = np.sum(weights * diff**2, axis=1, keepdims=True)  # (nb, 1)
 
                 mad_1 = cp.array(mad)
                 best_mad_1 = cp.array(best_mad)
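One detail the CUDA variant relies on: the body keeps calling `np.sum`, `np.clip`, and friends on CuPy arrays. That works because CuPy implements NumPy's `__array_function__` protocol (NEP 18), so these NumPy entry points dispatch to CuPy's GPU kernels when handed `cupy.ndarray` inputs. A small guarded sketch (the CuPy branch needs an actual CUDA device to run):

```python
import numpy as np

try:
    import cupy as cp

    x = cp.arange(6, dtype=cp.float32).reshape(2, 3)  # lives on the GPU
    s = np.sum(x, axis=1, keepdims=True)  # dispatches to CuPy via NEP 18
    print(type(s))  # <class 'cupy.ndarray'> -- result stayed on the GPU
except ImportError:
    pass  # without cupy, the library falls back to the CPU path shown earlier
```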
@@ -383,7 +386,7 @@ def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
                 scale[idx_to_replace] = this_scale[idx_to_replace]
                 rmin[idx_to_replace] = this_min[idx_to_replace]
 
-            zero_point = np.clip((( - rmin) / scale).round(), 0, maxq).astype("uint8")
+            zero_point = np.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
             scale = scale.astype(np.float64)
             q_weight = np.empty_like(data, dtype=scale.dtype)
             np.divide(data, scale, out=q_weight)
@@ -393,20 +396,18 @@ def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
 
             return q_weight.get(), scale.get(), zero_point.get()
         else:
-            logger.warning("Try to use k-quant quantization on CUDA. However, CUDA is not available." \
-                           "Fall back to k-quant quantization on CPU.")
-            return quant_tensor_k_quant_cpu(
-                data, num_bits, group_size
+            logger.warning(
+                "Try to use k-quant quantization on CUDA. However, CUDA is not available."
+                "Fall back to k-quant quantization on CPU."
             )
+            return quant_tensor_k_quant_cpu(data, num_bits, group_size)
     except ImportError:
         logger.info(
-            "Now we are using k-quant quantization on cpu, which is time consuming." \
-            "Please consider install cupy to speed up on CUDA. See https://cupy.dev/" \
-            "Please also install torch to check CUDA availablity."
-        )
-        return quant_tensor_k_quant_cpu(
-            data, num_bits, group_size
+            "Now we are using k-quant quantization on cpu, which is time consuming."
+            "Please consider install cupy to speed up on CUDA. See https://cupy.dev/"
+            "Please also install torch to check CUDA availability."
         )
+        return quant_tensor_k_quant_cpu(data, num_bits, group_size)
 
 
 def qdq_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
@@ -536,9 +537,7 @@ def rtn_quantize(
         # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP
         # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP
         if algorithm == "k_quant":
-            q_weight, scale, zp = quant_tensor_k_quant_cuda(
-                weight.T, num_bits, group_size
-            )
+            q_weight, scale, zp = quant_tensor_k_quant_cuda(weight.T, num_bits, group_size)
         else:
             q_weight, scale, zp = quant_tensor(
                 weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)
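For orientation, a hypothetical end-to-end use of the CPU entry point touched by this commit. The round-trip relation `data ≈ (q_weight - zero_point) * scale` is inferred from the visible scale/zero-point math above, not something the commit states, so treat this strictly as a sketch:

```python
import numpy as np

# Hypothetical usage sketch; assumes neural_compressor is importable and that
# q_weight holds unsigned integer codes laid out per (nb, group_size) group.
from neural_compressor.adaptor.ox_utils.weight_only import quant_tensor_k_quant_cpu

w = np.random.randn(128, 32).astype(np.float32)
q, scale, zp = quant_tensor_k_quant_cpu(w, num_bits=4, group_size=32)

deq = (q.reshape(-1, 32) - zp) * scale  # per-group dequantization (assumed relation)
err = float(np.abs(deq - w.reshape(-1, 32)).max())
print(f"max abs round-trip error: {err:.4f}")
```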
