
Commit 712a731

merge main
Signed-off-by: Brian Dellabetta <[email protected]>
2 parents 7f2c5de + c1e53b1 commit 712a731

File tree: 4 files changed, +40 −25 lines changed

.github/workflows/test.yml

Lines changed: 10 additions & 0 deletions

@@ -89,6 +89,16 @@ jobs:
       with:
         python-version: ${{ inputs.python }}
 
+    - name: install system dependencies
+      run: |
+        if command -v g++ >/dev/null 2>&1; then
+          echo "found g++ compiler"
+        else
+          echo "installing g++ etc compilers..."
+          sudo apt update && sudo apt install -y g++ gcc
+        fi
+      shell: bash
+
     - name: checkout code
       id: checkout
       uses: actions/checkout@v4

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 10 additions & 7 deletions

@@ -316,10 +316,10 @@ def __init__(
 
         self.quantization_compressor = {}
         for format in self.compression_formats:
-            self.quantization_compressor[
-                format
-            ] = BaseCompressor.load_from_registry(
-                format, config=quantization_config
+            self.quantization_compressor[format] = (
+                BaseCompressor.load_from_registry(
+                    format, config=quantization_config
+                )
             )
 
     # ----- used by hf quantizer ----- #

@@ -705,9 +705,12 @@ def decompress(self, model_path: str, model: Module):
         with override_quantization_status(
             self.quantization_config, QuantizationStatus.FROZEN
         ):
-            names_to_scheme = apply_quantization_config(
-                model, self.quantization_config
-            )
+            apply_quantization_config(model, self.quantization_config)
+            names_to_scheme: Set[QuantizationScheme] = {
+                name: getattr(module, "quantization_scheme")
+                for name, module in model.named_modules()
+                if getattr(module, "quantization_scheme", None) is not None
+            }
             # Load activation scales/zp or any other quantization parameters
             # Conditionally load the weight quantization parameters if we have a
             # dense compressor or if a sparsity compressor has already been applied
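Note on the decompress() change above: apply_quantization_config no longer returns a name-to-scheme mapping, so the compressor rebuilds it from the quantization_scheme attributes that the call leaves on each matched module. A minimal standalone sketch of that pattern (TinyModel and the placeholder scheme string are illustrative, not part of the repository):

import torch
from torch import nn


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 4)
        self.head = nn.Linear(4, 2)


model = TinyModel()
# Pretend apply_quantization_config() tagged only `proj` with a scheme.
model.proj.quantization_scheme = "scheme"

# Rebuild the name -> scheme mapping from attributes left on the modules.
names_to_scheme = {
    name: getattr(module, "quantization_scheme")
    for name, module in model.named_modules()
    if getattr(module, "quantization_scheme", None) is not None
}
print(names_to_scheme)  # {'proj': 'scheme'}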

src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py

Lines changed: 5 additions & 4 deletions

@@ -123,6 +123,7 @@ def decompress_weight(
     return decompressed_weight
 
 
+@torch.compile(fullgraph=True, dynamic=True)
 def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
     """
     Packs a tensor with values in the fp4 range into uint8.

@@ -145,12 +146,11 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
 
     # Find closest valid FP4 value index for each element
     abs_x = torch.abs(x)
-    abs_indices = torch.zeros_like(abs_x, dtype=torch.long)
-    for i, val in enumerate(kE2M1):
-        abs_indices = torch.where(torch.isclose(abs_x, val), i, abs_indices)
+    abs_diff_x = torch.abs(abs_x.unsqueeze(-1) - kE2M1)  # [m, n, 8]
+    abs_indices = torch.argmin(abs_diff_x, dim=-1)  # [m, n]
 
     # Apply sign bit (bit 3) to get final 4-bit representation
-    indices = abs_indices + (torch.signbit(x) << 3).to(torch.long)
+    indices = abs_indices + (torch.signbit(x).to(torch.long) << 3)
 
     # Reshape to prepare for packing pairs of values
     indices = indices.reshape(-1)

@@ -174,6 +174,7 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
 
 
 # reference: : https://github.com/vllm-project/vllm/pull/16362
+@torch.compile(fullgraph=True, dynamic=True)
 def unpack_fp4_from_uint8(
     a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16
 ) -> torch.Tensor:
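Note on the pack_fp4_to_uint8 change above: the per-entry torch.where loop is replaced by one broadcasted absolute-difference against the eight-entry kE2M1 table followed by argmin, and the sign bit is cast to long before shifting, which keeps the function graph-friendly for the new torch.compile decorators. A small self-contained sketch of the lookup (the kE2M1 values and the sample input below are illustrative):

import torch

# Non-negative FP4 E2M1 values, assumed to match the table in the source file.
kE2M1 = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

x = torch.tensor([[0.4, -1.6, 5.0], [-0.1, 2.9, -6.0]])
abs_x = torch.abs(x)

# Vectorized: broadcast against the 8-entry table and take the argmin.
abs_diff = torch.abs(abs_x.unsqueeze(-1) - kE2M1)   # [m, n, 8]
abs_indices = torch.argmin(abs_diff, dim=-1)        # [m, n]

# Sign bit becomes bit 3 of the 4-bit code; cast to long before shifting
# so the shift runs on integers rather than on a bool tensor.
indices = abs_indices + (torch.signbit(x).to(torch.long) << 3)
print(indices)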

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 15 additions & 14 deletions

@@ -115,7 +115,7 @@ def load_pretrained_quantization_parameters(
 
 def apply_quantization_config(
     model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
-) -> Dict[str, QuantizationScheme]:
+):
     """
     Initializes the model for quantization in-place based on the given config.
     Optionally coverts quantizable modules to compressed_linear modules

@@ -125,19 +125,17 @@ def apply_quantization_config(
     :param run_compressed: Whether the model will be run in compressed mode or
         decompressed fully on load
     """
-    # Workaround for when HF Quantizer passes None, see PR #180
-    if config is None:
-        return dict()
 
-    # remove reference to the original `config`
-    # argument. This function can mutate it, and we'd
-    # like to keep the original `config` as it is.
     config = deepcopy(config)
+    if config is None:  # see PR #180
+        return dict()
+
+    # preprocess to support kv cache scheme
+    config = process_quantization_config(config)
+
     # build mapping of targets to schemes for easier matching
     # use ordered dict to preserve target ordering in config
     target_to_scheme = OrderedDict()
-    config = process_quantization_config(config)
-    names_to_scheme = dict()
     for scheme in config.config_groups.values():
         for target in scheme.targets:
             target_to_scheme[target] = scheme

@@ -150,13 +148,20 @@
         # quant scheme to the matching layers
         matched_targets = match_targets(name, submodule, target_to_scheme)
         scheme = _scheme_from_targets(target_to_scheme, matched_targets, name)
+
+        # target matched - add layer and scheme to target list
+        submodule.quantization_scheme = scheme
+
+        # replace with run compressed if applicable
+        # FUTURE: move this to model compressor
         if (
             run_compressed
-            and config.format != CompressionFormat.dense.value
             and isinstance(submodule, torch.nn.Linear)
+            and config.format != CompressionFormat.dense.value
         ):
             from compressed_tensors.linear.compressed_linear import CompressedLinear
 
+            # TODO: expand to more module types
             compressed_linear = CompressedLinear.from_linear(
                 submodule,
                 quantization_scheme=scheme,

@@ -167,13 +172,9 @@
         # target matched - add layer and scheme to target list
         submodule.quantization_scheme = scheme
 
-        names_to_scheme[name] = submodule.quantization_scheme
-
         # apply current quantization status to each targeted submodule
         apply_quantization_status(submodule, config.quantization_status)
 
-    return names_to_scheme
-
 
 def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
     """

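Note on the apply_quantization_config reordering above: the deepcopy now runs before the None guard, which is safe because deepcopy(None) is simply None, and the copy still shields the caller's config from in-place mutation. A small illustrative check (DummyConfig and apply below are stand-ins, not the library API):

from copy import deepcopy
from dataclasses import dataclass, field


@dataclass
class DummyConfig:
    config_groups: dict = field(default_factory=dict)


def apply(config):
    config = deepcopy(config)          # copy first ...
    if config is None:                 # ... then the PR #180 None guard
        return
    config.config_groups["group_0"] = "mutated"  # only the copy changes


apply(None)                            # no error on the None path
original = DummyConfig()
apply(original)
print(original.config_groups)          # {} -- caller's config untouched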