
Commit dba6c9a

add splitK support
1 parent 7a1c613 commit dba6c9a

File tree

4 files changed: +287 −36 lines changed


README.md

+3 −1
@@ -66,7 +66,9 @@ We implement two versions of the Triton kernels:
 
 * <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemm_A16fWnO16f_int32packing.py">GEMM</a></b>: This GEMM kernel is implemented similarly to <a href="https://github.com/fpgaminer/GPTQ-triton">GPTQ-triton</a>. Since it uses tensor cores, activations must be padded with zeros along the batch dimension to fit at least 16 rows. It supports both float32 and float16 accumulation for fp16 inputs, but only float32 accumulation for bfloat16.
 
-Both kernels are flexible, supporting 8, 4, 2, and 1-bit weight precisions.
+* <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemm_splitK_A16fWnO16f_int32packing.py">Split-K</a></b>: This Split-K GEMM kernel is implemented similarly to <a href="https://github.com/foundation-model-stack/foundation-model-stack/blob/triton/triton/kernels/gptq/splitk_dequant_gemm.py">the GPTQ Split-K version</a>. We build on the GEMM kernel above and add an extra grid dimension that splits the K dimension into multiple jobs, each computing a partial sum; the partial sums are atomically added and then stored. Split-K performs very well for batch sizes between 1 and 32, which makes it a great fit for LLM decoding.
+
+All kernels are flexible, supporting 8, 4, 2, and 1-bit weight precisions.
 
 To achieve optimal performance, it’s crucial to configure the eviction policy correctly. This is especially important in memory-bound scenarios, where we aim to cache activations by setting `eviction_policy="evict_last"`. Float16 accumulation further improves performance in compute-bound scenarios.
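For reference, the Split-K scheme described in the new bullet boils down to the accumulation pattern below. This is a minimal PyTorch sketch for illustration only (dequantization and the actual Triton kernel added in this commit are omitted): each K-slice computes a partial product that is summed into the output, which is the step the kernel performs with `tl.atomic_add`.

```python
import torch

def splitk_matmul_reference(A: torch.Tensor, B: torch.Tensor, split_k: int = 4) -> torch.Tensor:
    # A: (M, K), B: (K, N); K must be divisible by split_k, mirroring the
    # "K divisible by BLOCK_SIZE_K * SPLIT_K" constraint in the kernel's config pruner.
    M, K = A.shape
    N = B.shape[1]
    assert K % split_k == 0
    C = torch.zeros(M, N, dtype=A.dtype, device=A.device)
    chunk = K // split_k
    for pid_k in range(split_k):              # each slice maps to a separate program id in the kernel
        a = A[:, pid_k * chunk:(pid_k + 1) * chunk]
        b = B[pid_k * chunk:(pid_k + 1) * chunk, :]
        C += a @ b                            # the kernel performs this reduction with tl.atomic_add
    return C
```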

gemlite/core.py

+27 −34
@@ -152,37 +152,22 @@ def forward(self, x):
 ###################################################################################################################################
 # Triton backend
 ###################################################################################################################################
-def eval_time(fct, params, warmup=25, rep=200, fast_flush=True, return_mode="min"):
-    if isinstance(params, dict):
-        return do_bench(
-            lambda: fct(**params),
-            warmup=warmup,
-            rep=rep,
-            fast_flush=fast_flush,
-            return_mode=return_mode,
-        )
-    if isinstance(params, list):
-        return do_bench(
-            lambda: fct(*params),
-            warmup=warmup,
-            rep=rep,
-            fast_flush=fast_flush,
-            return_mode=return_mode,
-        )
+def eval_time_for_auto_mode(fct, params):
+    for _ in range(5):
+        _ = fct(*params) #Run first to kick-off Triton autotune
+    return do_bench(lambda: fct(*params), warmup=200, rep=50, fast_flush=True, return_mode='mean')
 
 
 GEMLITE_TRITON_CACHE = {}
 
 GEMLITE_TRITON_MAPPING = {
     ("fp16", "GEMV"): gemv_A16fWnO16f_int32packing,
     ("fp16", "GEMM"): gemm_A16fWnO16f_int32packing,
-    #("fp16", "GEMM_SPLITK"): gemm_splitK_A16fWnO16f_int32packing,
-
+    ("fp16", "GEMM_SPLITK"): gemm_splitK_A16fWnO16f_int32packing,
     ("bf16", "GEMM"): gemm_A16fWnO16f_int32packing,
 }
 
 def get_closest_m(M):
-    #return M if M <= 8 else 2 ** int(math.ceil(math.log2(M)))
     return 2 ** int(math.ceil(math.log2(M)))
 
 # Triton
@@ -196,6 +181,7 @@ def __init__(
         input_dtype = DType.FP16,
         output_dtype = DType.FP16,
         acc_dtype = DType.FP32,
+        exhaustive=False
     ):
         self._SUPPORTED_BITS_TRITON = [1, 2, 4, 8]
 
@@ -219,7 +205,7 @@ def __init__(
 
         self.compute_dtype = None
         if input_dtype == DType.FP16 and output_dtype == DType.FP16:
-            self.kernels = [gemm_A16fWnO16f_int32packing, gemv_A16fWnO16f_int32packing] #gemm_splitK_A16fWnO16f_int32packing
+            self.kernels = [gemm_A16fWnO16f_int32packing, gemv_A16fWnO16f_int32packing, gemm_splitK_A16fWnO16f_int32packing]
             self.compute_dtype = torch.float16
 
         if input_dtype == DType.BF16 and output_dtype == DType.BF16:
@@ -261,7 +247,10 @@ def __init__(
             ),
         )
 
-        self.forward = self.forward_auto
+        if(exhaustive):
+            self.forward = self.forward_auto_with_warmup
+        else:
+            self.forward = self.forward_auto_no_warmup
 
     # Pack data, adapted from: following the same logic as: https://github.com/LeiWang1999/AutoGPTQ.tvm/blob/dcd135b9784b9f98235fc91467fe3c3c8afa34fc/auto_gptq/nn_modules/qlinear_triton.py#L413-L419
     def pack(self, W_q, scales, zeros, bias=None):
@@ -290,20 +279,24 @@ def warmup(self, signature, args):
         global GEMLITE_TRITON_CACHE
         t = [np.inf] * len(self.kernels)
         for i, _kernel in enumerate(self.kernels):
-            if signature[0] >= 8 and _kernel.matmul_type == "GEMV": #skip gemvs for larger batch-sizes
+            if signature[0] > 1 and _kernel.matmul_type == "GEMV": #skip gemvs for larger batch-sizes
                 pass
+            if signature[0] > 32 and _kernel.matmul_type == "GEMM_SPLITK": #skip SPLIT_K for larger batch-sizes
+                pass
+            if signature[0] < 16 and _kernel.matmul_type == "GEMM": #skip GEMM for smaller matrices
+                pass
             else:
-                t[i] = eval_time(_kernel.forward, args)
+                t[i] = eval_time_for_auto_mode(_kernel.forward, args)
 
         indx = np.argmin(t)
         GEMLITE_TRITON_CACHE[signature] = {
             "forward": self.kernels[indx].forward,
             "time": t[indx],
+            "time_all": list(zip([k.matmul_type for k in self.kernels] , t))
         }
 
-    ################################################################################
-    #Main forward pass
-    def forward_auto(self, x):
+    #Exhaustive search
+    def forward_auto_with_warmup(self, x):
         global GEMLITE_TRITON_CACHE
         out_shape = x.shape[:-1] + (self.out_features,)
         x_input = x.view(-1, x.shape[-1])
@@ -329,13 +322,13 @@ def forward_auto(self, x):
             out += self.bias
         return out
 
-    # def forward_auto(self, x):
-    #     if(x.view(-1, x.shape[-1]).shape[0] == 1):
-    #         return self.forward_manual(x, matmul_type='GEMV') #GEMV / GEMM_SPLITK
-    #     else:
-    #         return self.forward_manual(x, matmul_type='GEMM')
-    #############################################################
-
+    def forward_auto_no_warmup(self, x):
+        if(x.view(-1, x.shape[-1]).shape[0] <= 16):
+            out = self.forward_manual(x, matmul_type='GEMM_SPLITK') #GEMV / GEMM_SPLITK
+        else:
+            out = self.forward_manual(x, matmul_type='GEMM')
+        return out
+
     def forward_manual(self, x, matmul_type="GEMM"):
         out_shape = x.shape[:-1] + (self.out_features,)
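For context, the auto-mode selection introduced above (`eval_time_for_auto_mode` plus `warmup()`) follows the pattern sketched below. This is a self-contained toy example using `triton.testing.do_bench` with plain matmul stand-ins rather than the gemlite kernels; the helper name `time_candidates` is illustrative only.

```python
import numpy as np
import torch
from triton.testing import do_bench

def time_candidates(candidates, args):
    # Mirror of the selection pattern: run each candidate a few times first so Triton
    # autotuning is triggered outside the measurement, then benchmark and keep the fastest.
    t = [np.inf] * len(candidates)
    for i, fct in enumerate(candidates):
        for _ in range(5):
            fct(*args)                       # warm up / trigger autotune
        t[i] = do_bench(lambda: fct(*args), warmup=200, rep=50, return_mode="mean")
    return candidates[int(np.argmin(t))], t

# Toy stand-ins for the real kernel backends:
x = torch.randn(16, 4096, device="cuda", dtype=torch.float16)
w = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
best, times = time_candidates([torch.matmul, lambda a, b: a @ b], (x, w))
```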

gemlite/triton_kernels/__init__.py

+2 −1
@@ -1,4 +1,5 @@
 from .gemm_A16fWnO16f_int32packing import gemm_A16fWnO16f_int32packing
 from .gemv_A16fWnO16f_int32packing import gemv_A16fWnO16f_int32packing
+from .gemm_splitK_A16fWnO16f_int32packing import gemm_splitK_A16fWnO16f_int32packing
 
-__all__ = ["gemm_A16fWnO16f_int32packing", "gemv_A16fWnO16f_int32packing"]
+__all__ = ["gemm_A16fWnO16f_int32packing", "gemv_A16fWnO16f_int32packing", "gemm_splitK_A16fWnO16f_int32packing"]
gemlite/triton_kernels/gemm_splitK_A16fWnO16f_int32packing.py

+255 −0
@@ -0,0 +1,255 @@
# Written by Dr. Hicham Badri @Mobius Labs GmbH - 2024
#********************************************************
import torch, math
from torch import Tensor
import triton
import triton.language as tl

def init_to_zero(name):
    return lambda nargs: nargs[name].zero_()

def is_divisible(dividend, divisor):
    return dividend % divisor == 0

def kernel_config_pruner(configs, nargs, **kwargs):
    m = nargs['M']
    n = nargs['N']
    k = nargs['K']
    g = nargs['group_size']

    used = set()
    for config in configs:
        group_size_m = config.kwargs['GROUP_SIZE_M']
        block_size_m = config.kwargs['BLOCK_SIZE_M'] #min(m, config.kwargs['BLOCK_SIZE_M'])
        block_size_n = config.kwargs['BLOCK_SIZE_N'] #min(n, config.kwargs['BLOCK_SIZE_N'])
        block_size_k = config.kwargs['BLOCK_SIZE_K'] #min(k, config.kwargs['BLOCK_SIZE_K'])
        split_k      = config.kwargs['SPLIT_K']

        #Constraints
        #BLOCK_SIZE_K >= group_size
        block_size_k = min(block_size_k, g)
        #K needs to be divisible by BLOCK_SIZE_K * SPLIT_K
        if(not is_divisible(k, block_size_k * split_k)):
            continue

        A_load_order      = config.kwargs['A_load_order']
        meta_evict_policy = config.kwargs['meta_evict_policy']
        atomic_mode       = config.kwargs['atomic_mode']

        _key = (block_size_m, block_size_n, block_size_k, group_size_m, split_k,
                A_load_order, meta_evict_policy, atomic_mode,
                config.num_stages, config.num_warps,
                )

        if _key in used:
            continue

        used.add(_key)
        yield triton.Config(
            {
                'BLOCK_SIZE_M': block_size_m,
                'BLOCK_SIZE_N': block_size_n,
                'BLOCK_SIZE_K': block_size_k,
                'GROUP_SIZE_M': group_size_m,
                'SPLIT_K'     : split_k,

                'A_load_order'      : A_load_order,
                'meta_evict_policy' : meta_evict_policy,
                'atomic_mode'       : atomic_mode,
            },
            num_stages=config.num_stages,
            num_warps=config.num_warps,
            pre_hook=config.pre_hook,
        )


def get_gemm_config():
    #Tuned on 4090 RTX
    _configs = []
    for _M in [16]: #This is fixed to 16 for skinny matrices
        for _N in [32, 64]:
            for _K in [32, 64, 128]: #[128], group_size >= 128
                for _w in [4]: #[4]
                    for _s in [2, 3]: #[2, 3]
                        for _sK in [2, 4, 8]: #[2, 4, 8]
                            for _a_load_order in [1, 2, 3]: #[1, 2, 3] - [1]: default 4090
                                for _meta_evict_policy in ['']: #['', 'evict_last'] - ['']: default 4090
                                    for _atomic_mode in ['release', 'relaxed']: #['release', 'relaxed']
                                        _configs.append(
                                            triton.Config(
                                                {'BLOCK_SIZE_M': _M, 'BLOCK_SIZE_N': _N, 'BLOCK_SIZE_K': _K,
                                                 'GROUP_SIZE_M': 8, 'SPLIT_K': _sK,
                                                 'A_load_order': _a_load_order, 'meta_evict_policy': _meta_evict_policy, 'atomic_mode': _atomic_mode,
                                                },
                                                num_stages=_s, num_warps=_w,
                                                pre_hook=init_to_zero("c_ptr"),
                                            )
                                        )
    return _configs



#@triton.heuristics(values={'CLOSEST_M': lambda args: 2 ** int(math.ceil(math.log2(args['M'])))})
@triton.autotune(
    configs = get_gemm_config(),
    key=['M', 'N', 'K', 'group_size', 'elements_per_sample'],
    prune_configs_by={
        'early_config_prune': kernel_config_pruner,
    },
    warmup=200,
    rep=50, #20 for faster tuning
)

@triton.jit
def gemm_splitK_A16fWnO16f_int32packing_kernel(
    a_ptr, b_ptr, c_ptr,
    scales_ptr, zeros_ptr,
    M, N, K,
    W_nbits: tl.constexpr, group_size: tl.constexpr, unpack_mask: tl.constexpr, elements_per_sample: tl.constexpr,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    stride_meta_g, stride_meta_n,
    acc_dtype: tl.constexpr,
    ######### tuning params #########
    #CLOSEST_M: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, SPLIT_K: tl.constexpr,
    A_load_order: tl.constexpr, meta_evict_policy: tl.constexpr, atomic_mode: tl.constexpr,
):
    """
    Based on https://github.com/foundation-model-stack/foundation-model-stack/blob/triton/triton/kernels/gptq/splitk_dequant_gemm.py
    GEMM for C = matmul(A, dequantize(B, scales, zeros))
    A is of shape (M, K): float16 or bfloat16
    B is of shape (K//elements_per_sample, N): int32 as a packed matrix
    C is of shape (M, N): float16 or bfloat16 depending on the input A
    scales and zeros are of shape (K//group_size, N): float16 or bfloat16

    BLOCK_SIZE_M >= 16
    BLOCK_SIZE_K * SPLIT_K <= group_size for imp1
    BLOCK_SIZE_K == SPLIT_K for imp2 (similar to original)
    """

    pid   = tl.program_id(axis=0)
    pid_k = tl.program_id(axis=1)

    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)

    #Swizzle
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id         = pid // num_pid_in_group
    first_pid_m      = group_id * GROUP_SIZE_M
    group_size_m     = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m            = first_pid_m + (pid % group_size_m)
    pid_n            = (pid % num_pid_in_group) // group_size_m

    #Offsets
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)

    #Vectorized coalesced load
    offs_am = tl.max_contiguous(tl.multiple_of(offs_m, BLOCK_SIZE_M), BLOCK_SIZE_M)
    offs_bn = tl.max_contiguous(tl.multiple_of(offs_n, BLOCK_SIZE_N), BLOCK_SIZE_N)

    #Inputs
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    a_mask = offs_am[:, None] < M
    b_ptrs = b_ptr + ((offs_k[:, None] // elements_per_sample) * stride_bk + offs_bn[None, :] * stride_bn)

    #Meta data stuff
    q_shifts = ((offs_k % elements_per_sample) * W_nbits).to(tl.int32)[:, None]

    scales_ptrs = scales_ptr + offs_bn[None, :] * stride_meta_n
    zeros_ptrs  = zeros_ptr  + offs_bn[None, :] * stride_meta_n

    stride_mul: tl.constexpr     = BLOCK_SIZE_K / group_size
    BLOCK_SIZE_K_P: tl.constexpr = BLOCK_SIZE_K // elements_per_sample
    ####################################################################################

    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype)

    for k in tl.range(0, num_pid_k, 1, num_stages=1):

        b = tl.load(b_ptrs, eviction_policy='evict_first')

        if(A_load_order == 1): #Early load
            a = tl.load(a_ptrs, mask=a_mask, other=0., eviction_policy='evict_last')

        #Meta-data loading policy
        k_m    = ((k * SPLIT_K + pid_k) * stride_mul).to(tl.int32)
        scales = tl.load(scales_ptrs + k_m * stride_meta_g, eviction_policy=meta_evict_policy)
        zeros  = tl.load(zeros_ptrs  + k_m * stride_meta_g, eviction_policy=meta_evict_policy)

        if(A_load_order == 2): #Mid load
            a = tl.load(a_ptrs, mask=a_mask, other=0., eviction_policy='evict_last')

        # Unpack and dequantize
        b = (b >> q_shifts) & unpack_mask
        b = (b.to(scales.dtype) - zeros) * scales

        if(A_load_order == 3): #Late load
            a = tl.load(a_ptrs, mask=a_mask, other=0., eviction_policy='evict_last')

        #Dot
        acc = tl.dot(a, b.to(a.dtype), acc=acc, out_dtype=acc_dtype, input_precision="ieee")

        #Advance
        a_ptrs += BLOCK_SIZE_K   * SPLIT_K * stride_ak
        b_ptrs += BLOCK_SIZE_K_P * SPLIT_K * stride_bk

    #Output
    #acc = acc.to(tl.float16)
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + (offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn)
    tl.atomic_add(c_ptrs, acc, mask=(offs_m[:, None] < M) & (offs_n[None, :] < N), sem=atomic_mode) #release / relaxed


@torch.library.custom_op("gemlite::gemm_splitK_A16fWnO16f_int32packing_forward", mutates_args=())
def gemm_splitK_A16fWnO16f_int32packing_forward(x: Tensor, W_q: Tensor, scales: Tensor, zeros: Tensor,
                                                W_nbits: int, group_size: int, unpack_mask: int, elements_per_sample: int,
                                                acc_dtype: int,
                                                ) -> Tensor:

    M, K, N = x.shape[0], x.shape[1], W_q.shape[1]

    #assert K == W_q.shape[0] * elements_per_sample, "Invalid Input Shapes"
    #assert group_size >= 128, "Only group_size >= 128 is currently supported"

    output = torch.empty((M, N), device=W_q.device, dtype=scales.dtype)

    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), META['SPLIT_K'])

    gemm_splitK_A16fWnO16f_int32packing_kernel[grid](
        x, W_q, output,
        scales, zeros,
        M, N, K,
        W_nbits, group_size, unpack_mask, elements_per_sample,
        x.stride(0), x.stride(1),
        W_q.stride(0), W_q.stride(1),
        output.stride(0), output.stride(1),
        scales.stride(0), scales.stride(1),
        tl.float16 if (acc_dtype == 1) else tl.float32,
    )

    return output

@torch.library.register_fake("gemlite::gemm_splitK_A16fWnO16f_int32packing_forward")
def gemm_splitK_A16fWnO16f_int32packing_forward_fake(x: Tensor, W_q: Tensor, scales: Tensor, zeros: Tensor,
                                                     W_nbits: int, group_size: int, unpack_mask: int, elements_per_sample: int,
                                                     acc_dtype: int,
                                                     ) -> Tensor:

    M, K, N = x.shape[0], x.shape[1], W_q.shape[1]
    return torch.empty((M, N), device=W_q.device, dtype=scales.dtype)


class gemm_splitK_A16fWnO16f_int32packing:
    kernel      = gemm_splitK_A16fWnO16f_int32packing_kernel
    forward     = gemm_splitK_A16fWnO16f_int32packing_forward
    matmul_type = "GEMM_SPLITK"

__all__ = ["gemm_splitK_A16fWnO16f_int32packing"]
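A hedged usage sketch of the new op's host-side entry point, based only on the signature above. The shapes, the random packed-weight tensor, and the unit scales are assumptions for a smoke/shape test on a CUDA GPU; they are not the packing produced by `pack()` in `gemlite/core.py`. Calling the op through the class attribute mirrors how `core.py` invokes `self.kernels[i].forward`.

```python
import torch
from gemlite.triton_kernels import gemm_splitK_A16fWnO16f_int32packing

M, K, N = 4, 4096, 4096
W_nbits, group_size = 4, 128
elements_per_sample = 32 // W_nbits        # 8 packed 4-bit values per int32
unpack_mask = (1 << W_nbits) - 1           # 0xF

x      = torch.randn(M, K, dtype=torch.float16, device="cuda")
W_q    = torch.randint(0, 2**31 - 1, (K // elements_per_sample, N), dtype=torch.int32, device="cuda")
scales = torch.ones(K // group_size, N, dtype=torch.float16, device="cuda")
zeros  = torch.zeros(K // group_size, N, dtype=torch.float16, device="cuda")

out = gemm_splitK_A16fWnO16f_int32packing.forward(
    x, W_q, scales, zeros,
    W_nbits, group_size, unpack_mask, elements_per_sample,
    1,  # acc_dtype: 1 selects float16 accumulation, anything else float32
)
print(out.shape)  # torch.Size([4, 4096])
```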
