 from torchao.prototype.blockwise_fp8.kernels import (
     blockwise_fp8_gemm,
-    fp8_blockwise_act_quant,
+    torch_blockwise_scale_act_quant,
+    triton_quantize_fp8_block,
 )
 
 
-class BlockwiseQuantLinear(nn.Module):
+class BlockwiseQuantLinear(nn.Linear):
     """
     Custom linear layer with support for quantized weights and optional bias.
@@ -24,54 +25,81 @@ class BlockwiseQuantLinear(nn.Module):
         block_size (int): Block size for quantization. Defaults to 128.
         dtype (torch.dtype): Data type for the weights. Defaults to torch.float8_e4m3fn.
     """
-
-    dtype = torch.bfloat16
+    supported_dtypes = [
+        torch.bfloat16,
+    ]
 
     def __init__(
         self,
-        in_features: int,
-        out_features: int,
-        bias: bool = False,
+        *args,
         block_size: int = 128,
-        dtype: torch.dtype = torch.float8_e4m3fn,
+        dtype=torch.bfloat16,
+        **kwargs,
     ):
-        super().__init__()
-        supported_dtypes = [
-            torch.float8_e4m3fn,
-            torch.float8_e5m2,
-        ]
-        assert dtype in supported_dtypes, (
-            f"Unsupported dtype: {dtype}. Supported dtypes: {supported_dtypes}"
-        )
-        scale_in_features = (in_features + block_size - 1) // block_size
-        scale_out_features = (out_features + block_size - 1) // block_size
-        self.weight = nn.Parameter(torch.empty(out_features, in_features, dtype=dtype))
-        self.weight.scale = self.scale = nn.Parameter(
-            torch.empty(scale_out_features, scale_in_features, dtype=torch.float32)
+        super().__init__(*args, **kwargs)
+
+        assert dtype in self.supported_dtypes, (
+            f"Unsupported dtype: {dtype}. Supported dtypes: {self.supported_dtypes}"
         )
         self.block_size = block_size
-        self.dtype
-
-        if bias:
-            self.bias = nn.Parameter(torch.empty(out_features))
-        else:
-            self.register_parameter("bias", None)
+        self.dtype = dtype
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Forward pass for the custom linear layer.
 
         Args:
-            x (torch.Tensor): Input tensor.
+            x (torch.Tensor): input tensor.
 
         Returns:
             torch.Tensor: Transformed tensor after linear computation.
         """
-        x, scale = fp8_blockwise_act_quant(x, self.block_size, self.dtype)
+        return fp8_blockwise_mm.apply(x, self.weight, self.block_size)
+
+
+class fp8_blockwise_mm(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, weight, block_size):
+        # torch.compile currently has the fastest activation quantization (1 x block_size)
+        x_fp8, x_scale = torch_blockwise_scale_act_quant(x, tile_size=block_size)
+
+        # fbgemm currently has the fastest weight quantization (block_size x block_size)
+        weight_fp8, weight_scale = triton_quantize_fp8_block(weight, block_m=block_size, block_k=block_size)
+
         y = blockwise_fp8_gemm(
-            x, scale, self.weight, self.weight.scale, self.block_size
+            x_fp8, x_scale,
+            weight_fp8, weight_scale,
+            block_size,
         )
-
-        if self.bias is not None:
-            y += self.bias
+        ctx.save_for_backward(x_fp8, x_scale, weight_fp8, weight_scale)
+        ctx.block_size = block_size
         return y
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        x_fp8, x_scale, weight_fp8, weight_scale = ctx.saved_tensors
+        block_size = ctx.block_size
+
+        grad_output_fp8, grad_output_scale = torch_blockwise_scale_act_quant(
+            grad_output, block_size,
+        )
+
+        grad_output_t_fp8, grad_output_t_scale = torch_blockwise_scale_act_quant(
+            grad_output.t(), block_size,
+        )
+
+        # grad_x = grad_output @ weight.T
+        grad_x = blockwise_fp8_gemm(
+            grad_output_fp8, grad_output_scale,
+            weight_fp8.t(), weight_scale.t(),
+            block_size,
+        )
+
+        # grad_weight = grad_output.T @ x
+        grad_weight = blockwise_fp8_gemm(
+            grad_output_t_fp8, grad_output_t_scale,
+            x_fp8, x_scale,
+            block_size,
+        )
+
+        return grad_x, grad_weight, None, None
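
For reference, a minimal usage sketch of the layer as changed above. Everything outside the diff is assumed for illustration: the sizes, the CUDA device with FP8 support, and the bf16 inputs are not part of this change.

    import torch

    # Hypothetical sizes; block_size=128 matches the default above.
    linear = BlockwiseQuantLinear(1024, 512, bias=False, block_size=128, dtype=torch.bfloat16).cuda()
    x = torch.randn(8, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)

    y = linear(x)        # activations quantized per (1 x 128) tile, weights per (128 x 128) block
    y.sum().backward()   # grad_x and grad_weight computed with blockwise FP8 GEMMs

Note that fp8_blockwise_mm saves the already-quantized x_fp8/weight_fp8 tensors for backward, so x and weight are not re-quantized when computing gradients.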
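The two quantization granularities above differ only in tile shape: each activation row gets one scale per (1 x block_size) tile, while weights get one scale per (block_size x block_size) block. The sketch below illustrates the activation-side tiling in plain PyTorch; it is illustrative only, and the real torch_blockwise_scale_act_quant kernel in torchao may differ in scale layout (e.g. storing reciprocal scales) and padding behavior.

    import torch

    def tile_quant_fp8_sketch(x: torch.Tensor, tile_size: int = 128):
        # One float32 scale per (1 x tile_size) tile along the last dimension,
        # chosen so each tile's absolute max maps to the FP8 e4m3 max value.
        fp8_max = torch.finfo(torch.float8_e4m3fn).max
        rows, cols = x.shape
        assert cols % tile_size == 0, "a real kernel would handle ragged tiles"
        tiles = x.float().reshape(rows, cols // tile_size, tile_size)
        amax = tiles.abs().amax(dim=-1, keepdim=True).clamp_min(1e-12)
        scale = amax / fp8_max
        x_fp8 = (tiles / scale).to(torch.float8_e4m3fn).reshape(rows, cols)
        return x_fp8, scale.squeeze(-1)  # scales: (rows, cols // tile_size)

    x = torch.randn(4, 256, dtype=torch.bfloat16)
    x_fp8, scales = tile_quant_fp8_sketch(x)
    # Dequantize to check the round trip: multiply each tile by its scale.
    x_approx = (x_fp8.float().reshape(4, 2, 128) * scales.unsqueeze(-1)).reshape(4, 256)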