Commit ba2426c

support awq with qbits, only support sym (#402)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7714796 commit ba2426c

4 files changed: +226 -18 lines
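
The new CPU AWQ path is exercised through the patched transformers quantizer (see the last hunk of auto_round/auto_quantizer.py below). A minimal usage sketch, assuming a checkpoint quantized by AutoRound, saved in AWQ packing format with symmetric quantization, and intel-extension-for-transformers installed for the QBits kernels; the checkpoint path is a placeholder:

from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round.auto_quantizer import AutoHfQuantizer  # noqa: F401  # importing applies the transformers patch shown below

model_path = "./opt-125m-autoround-awq-sym"  # placeholder: AWQ-format, sym-quantized checkpoint
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)

inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))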

auto_round/auto_quantizer.py (+19 -18)

@@ -368,7 +368,6 @@ def detect_device(self, target_backend, orig_backend):
             return device
         else:
             return "cpu"
-
 
     def convert_model(self, model: nn.Module):
         """Converts the given model to an AutoRound model by replacing its layers with quantized layers.
@@ -397,7 +396,7 @@ def convert_model(self, model: nn.Module):
             quantization_config.target_backend = quantization_config.backend
 
         target_device = self.detect_device(quantization_config.target_backend, quantization_config.backend)
-
+
         self.target_device = target_device
 
         if hasattr(quantization_config, "backend"): # pragma: no cover
@@ -416,7 +415,7 @@ def convert_model(self, model: nn.Module):
 
         quant_block_list = quantization_config.quant_block_list if hasattr(quantization_config,
                                                                            "quant_block_list") else None
-
+
         if quant_block_list is None:
             to_quant_block_names = quantization_config.to_quant_block_names if hasattr(quantization_config,
                                                                                        "to_quant_block_names") else None
@@ -564,7 +563,16 @@ def remove_device_str(s, device_str):
             layer_device = get_device(layer)
 
             bias = layer.bias is not None
-            if "awq" in layer_backend:
+            from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear
+            if "awq" in layer_backend and isinstance(QuantLinear, QBitsAWQQuantLinear):
+                new_layer = QuantLinear.from_linear( # pylint: disable=E1123
+                    layer,
+                    bits,
+                    group_size,
+                    init_only=True,
+                    has_zero_points=not sym
+                )
+            elif "awq" in layer_backend:
                 new_layer = QuantLinear.from_linear( # pylint: disable=E1123
                     layer,
                     bits,
@@ -596,23 +604,18 @@ def remove_device_str(s, device_str):
             set_module(module, layer_name, new_layer)
 
     def cpu_post_init(self, model):
-        dep_check = True
         message = "Repacking to CPU format"
+        from auto_round_extension.qbits import qbits_qlinear_classes, qbits_awq_classes
+        from auto_round_extension.ipex import ipex_qlinear_classes
+        cpu_layers = tuple(list(qbits_qlinear_classes) + list(ipex_qlinear_classes) + list(qbits_awq_classes))
         layers = [] ## ipex post_init will add one more layer
         for n, m in model.named_modules():
-            layers.append((n, m))
-
+            if isinstance(m, cpu_layers):
+                layers.append((n, m))
         for n, layer in tqdm(layers, desc=message, total=len(layers),
                              leave=True):
-            from auto_round_extension.qbits import qbits_qlinear_classes
-            from auto_round_extension.ipex import ipex_qlinear_classes
-            if isinstance(layer, qbits_qlinear_classes):
-                if dep_check:
-                    layer.req_check()
-                layer.post_init()
-                dep_check = False
-            if isinstance(layer, ipex_qlinear_classes):
-                layer.post_init()
+            layer.post_init()
+
 
         return model
 
@@ -758,5 +761,3 @@ def is_serializable(self):
 
 transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
 transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer
-
-
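
The new branch above creates the QBits AWQ layer in init-only mode with has_zero_points=not sym (this commit only supports the symmetric path), and the rewritten cpu_post_init then repacks every collected CPU layer through its post_init(). A minimal sketch of that init-only construction, assuming intel-extension-for-transformers is installed; the layer shape, bits, and group size are placeholders:

import torch.nn as nn
from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear

fp_layer = nn.Linear(4096, 4096, bias=False)   # placeholder shapes
sym = True                                     # only the symmetric scheme is supported on this path
qlayer = QBitsAWQQuantLinear.from_linear(
    fp_layer,
    4,                        # bits
    128,                      # group_size
    init_only=True,           # only allocate qweight/qzeros/scales buffers for state-dict loading
    has_zero_points=not sym,  # sym -> False, so the later repack runs without zero points
)
print(qlayer)                 # extra_repr shows in/out features, w_bit, group_size

Loading the state dict afterwards fills qweight/qzeros/scales, and cpu_post_init() triggers the actual repack.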

auto_round/backend.py (+11)

@@ -168,6 +168,14 @@ def check_auto_round_exllamav2_installed():
                                                    requirements=["intel-extension-for-transformers"]
                                                    )
 
+BackendInfos['auto_round:qbits_awq'] = BackendInfo(device=["cpu"], sym=[True],
+                                                   packing_format="awq",
+                                                   bits=[2, 4, 8], group_size=None,
+                                                   priority=0 if "intel" in get_cpu_manufacturer() else 5,
+                                                   feature_checks=[],
+                                                   requirements=["intel-extension-for-transformers"]
+                                                   )
+
 BackendInfos['auto_round:ipex_gptq'] = BackendInfo(device=["cpu"], sym=[True, False],
                                                    packing_format="ipex_gptq",
                                                    bits=[4], group_size=None,
@@ -317,6 +325,9 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym):
         if "zp" in backend:
             import auto_round_extension.qbits.qlinear_qbits_gptq as qlinear_qbits_gptq
             return qlinear_qbits_gptq.QuantLinear
+        elif "awq" in backend:
+            import auto_round_extension.qbits.qbits_awq as qlinear_qbits_awq
+            return qlinear_qbits_awq.QuantLinear
         else:  # auto_round must be at the end
             import auto_round_extension.qbits.qlinear_qbits as qlinear_qbits_autoround
             return qlinear_qbits_autoround.QuantLinear
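
The new entry restricts the CPU AWQ backend to symmetric schemes (sym=[True]) and 2/4/8 bits with awq packing format, and ranks it higher on Intel CPUs. An illustrative check, assuming this version of auto_round is installed and that BackendInfo exposes its constructor fields as attributes; the second import mirrors the dispatch branch added above:

from auto_round.backend import BackendInfos

info = BackendInfos["auto_round:qbits_awq"]
print(info.packing_format, info.bits, info.sym)  # awq [2, 4, 8] [True] -- asymmetric AWQ is not served by this backend

import auto_round_extension.qbits.qbits_awq as qlinear_qbits_awq  # what the "awq" branch above returns
print(qlinear_qbits_awq.QuantLinear)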

auto_round_extension/qbits/__init__.py (+3)

@@ -2,5 +2,8 @@
 from auto_round_extension.qbits.qlinear_qbits_gptq import (
     QuantLinear as QBitsGPTQQuantLinear,
 )
+from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear
 
 qbits_qlinear_classes = (QBitsQuantLinear, QBitsGPTQQuantLinear)
+
+qbits_awq_classes = (QBitsAWQQuantLinear,)

auto_round_extension/qbits/qbits_awq.py (new file, +193)

@@ -0,0 +1,193 @@
+import torch
+import torch.nn as nn
+AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
+def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int):
+    shifts = torch.arange(0, 32, bits, device="cpu")
+
+    # unpacking columnwise
+    iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
+        torch.int8  # smallest dtype available
+    )
+    iweights = iweights.view(iweights.shape[0], -1)
+
+    # unpacking columnwise
+    if qzeros is not None:
+        izeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to(
+            torch.int8  # smallest dtype available
+        )
+        izeros = izeros.view(izeros.shape[0], -1)
+    else:
+        izeros = qzeros
+
+    return iweights, izeros
+
+
+def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
+    reverse_order_tensor = torch.arange(
+        iweights.shape[-1],
+        dtype=torch.int32,
+        device="cpu",
+    )
+    reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
+    reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
+    reverse_order_tensor = reverse_order_tensor.view(-1)
+
+    if izeros is not None:
+        izeros = izeros[:, reverse_order_tensor]
+    iweights = iweights[:, reverse_order_tensor]
+    return iweights, izeros
+
+
+
+
+try:
+    from intel_extension_for_transformers import qbits  # with QBits kernels ()
+
+    QBITS_INSTALLED = True
+except:
+    QBITS_INSTALLED = False
+
+BITS_DTYPE_MAPPING = {
+    4: "int4_clip",
+    8: "int8",
+}
+
+
+def convert_dtype_torch2str(dtype):
+    if dtype == torch.int8:
+        return "int8"
+    elif dtype == torch.float:
+        return "fp32"
+    elif dtype == torch.float16:
+        return "fp16"
+    elif dtype == torch.bfloat16:
+        return "bf16"
+    elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]:
+        return dtype
+    else:
+        assert False, "Unsupported pytorch dtype {} to str dtype".format(dtype)
+
+
+class QuantLinear(nn.Module):
+
+    def __init__(self, w_bit, group_size, in_features, out_features, bias, zero_point, dev):
+        super().__init__()
+        assert QBITS_INSTALLED, \
+            "Please install ITREX qbits package with `pip install intel-extension-for-transformers`."
+
+        self.use_bf16 = qbits.check_isa_supported("AMX")
+
+        if w_bit not in [2, 3, 4, 8]:
+            raise NotImplementedError("Only 2, 3, 4, 8 bits are supported for now.")
+
+        self.in_features = in_features
+        self.out_features = out_features
+        self.w_bit = w_bit
+        self.group_size = group_size if group_size != -1 else in_features
+        self.zero_point = zero_point
+        self.scale_dtype = torch.float32
+
+        # quick sanity check (make sure alignment)
+        assert self.in_features % self.group_size == 0
+        assert out_features % (32 // self.w_bit) == 0
+        self.pack_num = 32 // self.w_bit
+        self.register_buffer(
+            "qzeros",
+            torch.zeros(
+                (in_features // self.group_size, out_features // self.pack_num),
+                dtype=torch.int8,
+                device=dev,
+            )
+        )
+        self.register_buffer(
+            "scales",
+            torch.zeros(
+                (in_features // self.group_size, out_features),
+                dtype=torch.bfloat16 if self.use_bf16 else torch.float32,
+                device=dev,
+            ))
+        if bias:
+            self.register_buffer(
+                "bias",
+                torch.zeros((out_features), dtype=torch.bfloat16 if self.use_bf16 else torch.float32, device=dev),
+            )
+        else:
+            self.register_buffer(
+                "bias",
+                None,
+            )
+        qweight = torch.zeros((in_features, out_features // self.pack_num), dtype=torch.int32, device=dev)
+        self.register_buffer("qweight", qweight)
+
+    def post_init(self):
+        assert self.qweight.device.type == "cpu"
+
+        intweight, zeros = unpack_awq(self.qweight, self.qzeros, self.w_bit)  # weight: k x n zeros: k / group_size x n
+        intweight, zeros = reverse_awq_order(intweight, zeros, self.w_bit)  # weight: k x n zeros: k / group_size x n
+        if self.zero_point:  ## asym has accuracy issue, have not root caused yet
+            intweight = torch.bitwise_and(intweight, (2 ** self.w_bit) - 1) - (2 ** (self.w_bit - 1))
+            zeros = torch.bitwise_and(zeros, (2 ** self.w_bit) - 1) - (2 ** (self.w_bit - 1))
+        else:
+            ##symmetric, our default zp is 8
+            intweight = torch.bitwise_and(intweight, (2 ** self.w_bit) - 1) - (2 ** (self.w_bit - 1))
+        g_idx = torch.empty(0, dtype=torch.int32)
+        self.qweight = qbits.repack_quantized_weight(intweight, self.scales.float(), zeros, g_idx,
+                                                     BITS_DTYPE_MAPPING[self.w_bit],
+                                                     convert_dtype_torch2str(self.scale_dtype),
+                                                     convert_dtype_torch2str(self.scales.dtype), self.zero_point,
+                                                     self.group_size)
+
+
+
+    @classmethod
+    def from_linear(cls, linear, w_bit, group_size, init_only=False, scales=None, zeros=None, has_zero_points=False):
+        awq_linear = cls(
+            w_bit,
+            group_size,
+            linear.in_features,
+            linear.out_features,
+            linear.bias is not None,
+            has_zero_points,
+            linear.weight.device,
+        )
+        if init_only:  # just prepare for loading sd
+            return awq_linear
+
+        raise NotImplementedError("Only inference is supported for Exllama kernels")
+
+    @torch.no_grad()
+    def forward(self, x):
+        assert QBITS_INSTALLED, (
+            "QBits kernels could not be loaded. "
+            "Please install with `pip install intel-extension-for-transformers` and "
+            "refer to the detail https://github.com/intel/intel-extension-for-transformers/blob/main/docs/qbits.md")
+
+        input_dtype = x.dtype
+        out_shape = x.shape[:-1] + (self.out_features,)
+        x = x.view(-1, x.shape[-1])  # convert xd to 2d
+        out_2d_shape = x.shape[:-1] + (self.out_features,)
+
+        outputs = torch.zeros(out_2d_shape, dtype=input_dtype)
+        bias = self.bias if self.bias is not None else torch.empty(
+            0, dtype=torch.bfloat16 if self.use_bf16 else torch.float32)
+
+        qbits.woq_linear(x, self.qweight, bias, outputs, convert_dtype_torch2str(input_dtype),
+                         BITS_DTYPE_MAPPING[self.w_bit], convert_dtype_torch2str(self.scale_dtype), True)
+
+        return outputs.view(out_shape)
+
+    def extra_repr(self) -> str:
+        return ("in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format(
+            self.in_features,
+            self.out_features,
+            self.bias is not None,
+            self.w_bit,
+            self.group_size,
+        ))
+
+
+def qbits_post_init(model):
+    for _, submodule in model.named_modules():
+        if isinstance(submodule, QuantLinear):
+            submodule.post_init()
+
+    return model
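
post_init() above does the actual repack: unpack the AWQ int32 words, undo the interleaved column order, recenter the values around zero, and hand the result to qbits.repack_quantized_weight. A small worked example of the two unpacking helpers on toy data, assuming this version of auto_round is installed; shapes and values are arbitrary:

import torch
from auto_round_extension.qbits.qbits_awq import unpack_awq, reverse_awq_order

bits = 4
pack_num = 32 // bits                                   # 8 nibbles per int32 word, as in QuantLinear.pack_num
in_features, out_features = 16, 8
qweight = torch.randint(-2**31, 2**31 - 1, (in_features, out_features // pack_num), dtype=torch.int32)
qzeros = torch.zeros((1, out_features // pack_num), dtype=torch.int32)   # one group over in_features

iweights, izeros = unpack_awq(qweight, qzeros, bits)          # shift out each 4-bit field -> (16, 8)
iweights, izeros = reverse_awq_order(iweights, izeros, bits)  # undo AWQ's interleaved column order

# post_init then keeps the low `bits` bits and recenters around zero (sym default zp is 8):
intweight = torch.bitwise_and(iweights, (2 ** bits) - 1) - (2 ** (bits - 1))
print(intweight.shape, int(intweight.min()), int(intweight.max()))       # torch.Size([16, 8]), values in [-8, 7]

The repacked qweight produced from these values is what forward() later feeds to qbits.woq_linear.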
