Commit 2eeabf4

add limits to get max/min for integer types and inf/-inf for floating-point types; fix max/min/argmax/argmin/cummin
1 parent 06a1845 commit 2eeabf4

7 files changed: +126 −70 lines
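The diffs below import get_dtype_min / get_dtype_max from a new ..utils.limits module. That module is among the 7 changed files but is not shown on this page, so what follows is only a plausible sketch: it assumes the helpers are @triton.jit functions (they are called from inside kernels with a compile-time element type) that hardcode integer limits and return inf/-inf for floating-point types, per the commit message. Names and structure beyond the two function names are hypothetical.

    import triton
    import triton.language as tl


    @triton.jit
    def get_dtype_min(dtype):
        # `dtype` arrives as a compile-time element type such as
        # inp.type.element_ty, so these branches fold away at compile time.
        if dtype is tl.int8:
            return -128
        if dtype is tl.int16:
            return -32768
        if dtype is tl.int32:
            return -2147483648
        if dtype is tl.int64:
            return -9223372036854775808
        # floating-point types (incl. float16/bfloat16) use -inf
        return -float("inf")


    @triton.jit
    def get_dtype_max(dtype):
        if dtype is tl.int8:
            return 127
        if dtype is tl.int16:
            return 32767
        if dtype is tl.int32:
            return 2147483647
        if dtype is tl.int64:
            return 9223372036854775807
        return float("inf")

Whatever the exact implementation, the point of the commit is that the fill value used for masked loads now matches the input's element type instead of always being a float32 -inf/inf.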

src/flag_gems/ops/argmax.py (+10 −4)

@@ -9,6 +9,7 @@
 from ..runtime import torch_device_fn
 from ..utils import libentry
 from ..utils import triton_lang_extension as tle
+from ..utils.limits import get_dtype_min


 @libentry()
@@ -24,7 +25,8 @@ def argmax_kernel_1(
     offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
     inp_ptrs = inp + offset
     mask = offset < M
-    inp_val = tl.load(inp_ptrs, mask=mask, other=-float("inf"))
+    min_value = get_dtype_min(inp.type.element_ty)
+    inp_val = tl.load(inp_ptrs, mask=mask, other=min_value)
     max_val, max_index = tl.max(inp_val, axis=0, return_indices=True)
     max_index = max_index + pid * BLOCK_SIZE
     mid_value_ptr = mid_value + pid
@@ -39,7 +41,8 @@ def argmax_kernel_2(mid_value, mid_index, out, mid_size, BLOCK_MID: tl.constexpr
     offset = tl.arange(0, BLOCK_MID)
     mid_ptrs = mid_value + offset
     mask = offset < mid_size
-    mid_val = tl.load(mid_ptrs, mask=mask, other=-float("inf"))
+    min_value = get_dtype_min(mid_value.type.element_ty)
+    mid_val = tl.load(mid_ptrs, mask=mask, other=min_value)
     index_val = tl.argmax(mid_val, axis=0)
     mid_index_ptrs = mid_index + index_val
     out_val = tl.load(mid_index_ptrs)
@@ -63,14 +66,17 @@ def argmax_kernel(
     pid_k = tle.program_id(1)
     m_offset = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)

-    max_values = tl.full([BLOCK_M], dtype=tl.float32, value=float("-inf"))
+    dtype = inp.type.element_ty
+    acc_type = tl.float32 if dtype is tl.bfloat16 else dtype
+    min_value = get_dtype_min(dtype)
+    max_values = tl.full([BLOCK_M], dtype=acc_type, value=min_value)
     argmax_values = tl.full([BLOCK_M], dtype=tl.int64, value=0)
     for start_n in range(0, N, BLOCK_N):
         n_offset = start_n + tl.arange(0, BLOCK_N)
         offset = m_offset[:, None] * N * K + n_offset[None, :] * K + pid_k
         mask = m_offset[:, None] < M and n_offset[None, :] < N
         inp_ptrs = inp + offset
-        inp_vals = tl.load(inp_ptrs, mask=mask, other=-float("inf")).to(tl.float32)
+        inp_vals = tl.load(inp_ptrs, mask=mask, other=min_value)
         local_max, local_argmax = tl.max(
             inp_vals, 1, return_indices=True, return_indices_tie_break_left=True
         )
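Note what the last hunk removes: the old kernel kept its running maximum in tl.float32 and cast every loaded block with .to(tl.float32), which is lossy for wide integer types; the new code accumulates in the input dtype (promoting only bfloat16). A host-side illustration of the hazard, not taken from the diff:

    import torch

    x = torch.tensor([2**53, 2**53 + 1], dtype=torch.int64)
    print(x.max().item())          # 9007199254740993 -- exact in int64
    print(x.float().max().item())  # 9007199254740992.0 -- the +1 is lost in float32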

src/flag_gems/ops/argmin.py (+12 −27)

@@ -9,14 +9,7 @@
 from ..runtime import torch_device_fn
 from ..utils import libentry
 from ..utils import triton_lang_extension as tle
-
-torch_dtype_to_tl_dtype_and_max_value = {
-    torch.int16: (tl.int16, torch.iinfo(torch.int16).max),
-    torch.int32: (tl.int32, torch.iinfo(torch.int32).max),
-    torch.float16: (tl.float16, torch.finfo(torch.float16).max),
-    torch.float32: (tl.float32, torch.finfo(torch.float32).max),
-    torch.bfloat16: (tl.float32, torch.finfo(torch.float32).max),
-}
+from ..utils.limits import get_dtype_max


 @libentry()
@@ -27,13 +20,14 @@ def argmin_kernel_1(
     mid_index,
     M,
     BLOCK_SIZE: tl.constexpr,
-    dtype_max_value: tl.constexpr,
 ):
     pid = tle.program_id(0)
     offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
     inp_ptrs = inp + offset
     mask = offset < M
-    inp_val = tl.load(inp_ptrs, mask=mask, other=dtype_max_value)
+
+    max_value = get_dtype_max(inp.type.element_ty)
+    inp_val = tl.load(inp_ptrs, mask=mask, other=max_value)
     min_val, min_index = tl.min(inp_val, axis=0, return_indices=True)
     min_index = min_index + pid * BLOCK_SIZE
     mid_value_ptr = mid_value + pid
@@ -50,12 +44,12 @@ def argmin_kernel_2(
     out,
     mid_size,
     BLOCK_MID: tl.constexpr,
-    dtype_max_value: tl.constexpr,
 ):
     offset = tl.arange(0, BLOCK_MID)
     mid_ptrs = mid_value + offset
     mask = offset < mid_size
-    mid_val = tl.load(mid_ptrs, mask=mask, other=dtype_max_value)
+    max_value = get_dtype_max(mid_value.type.element_ty)
+    mid_val = tl.load(mid_ptrs, mask=mask, other=max_value)
     index_val = tl.argmin(mid_val, axis=0)
     mid_index_ptrs = mid_index + index_val
     out_val = tl.load(mid_index_ptrs)
@@ -75,8 +69,6 @@ def argmin_kernel(
     M,
     N,
     K,
-    tl_dtype: tl.constexpr,
-    dtype_max_value: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
 ):
@@ -85,18 +77,18 @@ def argmin_kernel(
     pid_k = tle.program_id(1)
     m_offset = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)

-    # min_values = tl.full([BLOCK_M], dtype=tl.float32, value=float("inf"))
-    if tl_dtype is tl.int16:
-        tl_dtype = tl.int32
-    min_values = tl.full([BLOCK_M], dtype=tl_dtype, value=dtype_max_value)
+    dtype = inp.type.element_ty
+    acc_type = tl.float32 if dtype is tl.bfloat16 else dtype
+    max_value = get_dtype_max(dtype)
+    min_values = tl.full([BLOCK_M], dtype=acc_type, value=max_value)
     argmin_values = tl.full([BLOCK_M], dtype=tl.int64, value=0)
     for start_n in range(0, N, BLOCK_N):
         n_offset = start_n + tl.arange(0, BLOCK_N)
         offset = m_offset[:, None] * N * K + n_offset[None, :] * K + pid_k
         mask = m_offset[:, None] < M and n_offset[None, :] < N
         inp_ptrs = inp + offset
-        # inp_vals = tl.load(inp_ptrs, mask=mask, other=float("inf"))
-        inp_vals = tl.load(inp_ptrs, mask=mask, other=dtype_max_value)
+        inp_vals = tl.load(inp_ptrs, mask=mask, other=max_value)
+        # tl.bfloat16 is promoted to tl.float32 by tl.min
         local_min, local_argmin = tl.min(
             inp_vals, 1, return_indices=True, return_indices_tie_break_left=True
         )
@@ -132,23 +124,20 @@ def argmin(inp, dim=None, keepdim=False, *, dtype=None):
         else:
             out = torch.empty([], dtype=torch.int64, device=inp.device)

-        tl_dtype, dtype_max_value = torch_dtype_to_tl_dtype_and_max_value[inp.dtype]
         with torch_device_fn.device(inp.device):
             argmin_kernel_1[(mid_size, 1, 1)](
                 inp,
                 mid_value,
                 mid_index,
                 M,
                 block_size,
-                dtype_max_value,
             )
             argmin_kernel_2[(1, 1, 1)](
                 mid_value,
                 mid_index,
                 out,
                 mid_size,
                 block_mid,
-                dtype_max_value,
             )
         return out
     else:
@@ -167,8 +156,6 @@ def argmin(inp, dim=None, keepdim=False, *, dtype=None):
         if not keepdim:
             out_index = torch.squeeze(out_index, dim)

-        tl_dtype, dtype_max_value = torch_dtype_to_tl_dtype_and_max_value[inp.dtype]
-
         grid = lambda meta: (
             triton.cdiv(M, meta["BLOCK_M"]),
             K,
@@ -180,8 +167,6 @@ def argmin(inp, dim=None, keepdim=False, *, dtype=None):
             M,
             N,
             K,
-            tl_dtype,
-            dtype_max_value,
         )

     return out_index
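The one dtype that still changes representation is bfloat16: as the new in-kernel comment notes, tl.min promotes bf16 operands to float32, so the accumulator (acc_type) must be tl.float32 for bf16 inputs. This is safe because every bf16 value is exactly representable in float32, while bf16 itself carries only 8 significand bits. A small host-side illustration of those dtype properties (not code from the diff):

    import torch

    # 1 + 2**-9 is below bfloat16's resolution near 1.0 and rounds back to 1.0 ...
    print(torch.tensor(1.0 + 2**-9).to(torch.bfloat16).item())      # 1.0
    # ... while the float32 round-trip of any bfloat16 value is exact:
    print(torch.tensor(1.5, dtype=torch.bfloat16).float().item())   # 1.5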

src/flag_gems/ops/cummin.py (+5 −2)

@@ -8,6 +8,7 @@
 from ..runtime import torch_device_fn
 from ..utils import libentry
 from ..utils import triton_lang_extension as tle
+from ..utils.limits import get_dtype_max


 @triton.jit
@@ -76,8 +77,9 @@ def scan_part_min_kernel(
     offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
     mask = offset < n_elements

+    max_value = get_dtype_max(inp.type.element_ty)
     inp_ptrs = inp + offset
-    inp_vals = tl.load(inp_ptrs, mask=mask, other=float("inf"))
+    inp_vals = tl.load(inp_ptrs, mask=mask, other=max_value)
     if (
         tl.constexpr(inp_vals.dtype.is_int64())
         or tl.constexpr(inp_vals.dtype.is_uint64())
@@ -169,7 +171,8 @@ def scan_part_min_abc_kernel(

     mask = b_idx < B
     inp_ptrs = inp + offset
-    inp_vals = tl.load(inp_ptrs, mask=mask, other=float("inf"))
+    max_value = get_dtype_max(inp.type.element_ty)
+    inp_vals = tl.load(inp_ptrs, mask=mask, other=max_value)
     if (
         tl.constexpr(inp_vals.dtype.is_int64())
         or tl.constexpr(inp_vals.dtype.is_uint64())
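The fill value for masked-off lanes in a min-scan must be the identity element of min, i.e. the dtype's maximum: min(x, MAX) == x for every representable x, whereas float("inf") does not even exist in integer dtypes. A quick host-side check of the identity property (illustrative):

    import torch

    x = torch.tensor([5, 3, 9], dtype=torch.int32)
    pad = torch.full((2,), torch.iinfo(torch.int32).max, dtype=torch.int32)
    # Padding with the dtype max leaves the running minimum untouched:
    print(torch.cummin(x, 0).values)                    # tensor([5, 3, 3], ...)
    print(torch.cummin(torch.cat([x, pad]), 0).values)  # tensor([5, 3, 3, 3, 3], ...)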

src/flag_gems/ops/max.py (+15 −19)

@@ -10,6 +10,7 @@
 from ..runtime import torch_device_fn
 from ..utils import libentry
 from ..utils import triton_lang_extension as tle
+from ..utils.limits import get_dtype_min


 @libentry()
@@ -18,26 +19,27 @@ def max_kernel_1(
     inp,
     mid,
     M,
-    DTYPE_MIN,
     BLOCK_SIZE: tl.constexpr,
 ):
     pid = tle.program_id(0)
     offset = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
     inp_ptrs = inp + offset
     mask = offset < M
-    inp_val = tl.load(inp_ptrs, mask=mask, other=DTYPE_MIN)
+    min_value = get_dtype_min(inp.type.element_ty)
+    inp_val = tl.load(inp_ptrs, mask=mask, other=min_value)
     max_val = tl.max(inp_val)
     mid_ptr = mid + pid
     tl.store(mid_ptr, max_val)


 @libentry()
 @triton.jit
-def max_kernel_2(mid, out, mid_size, DTYPE_MIN, BLOCK_MID: tl.constexpr):
+def max_kernel_2(mid, out, mid_size, BLOCK_MID: tl.constexpr):
     offset = tl.arange(0, BLOCK_MID)
     mid_ptrs = mid + offset
     mask = offset < mid_size
-    mid_val = tl.load(mid_ptrs, mask=mask, other=DTYPE_MIN)
+    min_value = get_dtype_min(mid.type.element_ty)
+    mid_val = tl.load(mid_ptrs, mask=mask, other=min_value)
     max_val = tl.max(mid_val)
     tl.store(out, max_val)

@@ -62,23 +64,26 @@ def max_kernel(
     M,
     N,
     K,
-    DTYPE_MIN,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
 ):
     # set offset
     pid_m = tle.program_id(0)
     pid_k = tle.program_id(1)
     m_offset = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    result_value = tl.full([BLOCK_M], value=-float("inf"), dtype=tl.float32)
+
+    dtype = inp.type.element_ty
+    acc_type = tl.float32 if dtype is tl.bfloat16 else dtype
+    min_value = get_dtype_min(dtype)
+    result_value = tl.full([BLOCK_M], value=min_value, dtype=acc_type)
     result_index = tl.zeros([BLOCK_M], dtype=tl.int64)
     for i in range(0, N, BLOCK_N):
         n_offset = i + tl.arange(0, BLOCK_N)
         offset = m_offset[:, None] * N * K + n_offset[None, :] * K + pid_k
         # set mask
         mask = m_offset[:, None] < M and n_offset[None, :] < N
         inp_ptrs = inp + offset
-        inp_vals = tl.load(inp_ptrs, mask=mask, other=DTYPE_MIN)
+        inp_vals = tl.load(inp_ptrs, mask=mask, other=min_value)
         max_value, max_index = tl.max(inp_vals, axis=1, return_indices=True)
         update_mask = max_value > result_value
         result_value = tl.where(update_mask, max_value, result_value)
@@ -104,13 +109,9 @@ def max(inp):
     mid = torch.empty((mid_size,), dtype=dtype, device=inp.device)
     out = torch.empty([], dtype=dtype, device=inp.device)

-    if torch.is_floating_point(inp):
-        dtype_min = torch.finfo(inp.dtype).min
-    else:
-        dtype_min = torch.iinfo(inp.dtype).min
     with torch_device_fn.device(inp.device):
-        max_kernel_1[(mid_size, 1, 1)](inp, mid, M, dtype_min, block_size)
-        max_kernel_2[(1, 1, 1)](mid, out, mid_size, dtype_min, block_mid)
+        max_kernel_1[(mid_size, 1, 1)](inp, mid, M, block_size)
+        max_kernel_2[(1, 1, 1)](mid, out, mid_size, block_mid)
     return out


@@ -134,17 +135,12 @@ def max_dim(inp, dim=None, keepdim=False):
         out_value = torch.squeeze(out_value, dim)
         out_index = torch.squeeze(out_index, dim)

-    if torch.is_floating_point(inp):
-        dtype_min = torch.finfo(inp.dtype).min
-    else:
-        dtype_min = torch.iinfo(inp.dtype).min
-
     grid = lambda meta: (
         triton.cdiv(M, meta["BLOCK_M"]),
         K,
     )
     with torch_device_fn.device(inp.device):
-        max_kernel[grid](inp, out_value, out_index, M, N, K, dtype_min)
+        max_kernel[grid](inp, out_value, out_index, M, N, K)
     Max_out = namedtuple("max", ["values", "indices"])
     out = Max_out(values=out_value, indices=out_index)
     return out
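With the float sentinels gone, the integer paths of max/max_dim stay entirely in the input dtype. A sanity check of the behavior these fixes target (hypothetical usage: assumes a CUDA device and that flag_gems.enable() patches the corresponding aten ops, as in the project's usage instructions):

    import torch
    import flag_gems

    flag_gems.enable()  # route supported aten ops to the Triton kernels

    x = torch.tensor([[-7, -2], [-9, -4]], dtype=torch.int32, device="cuda")
    # With a float32 -inf sentinel these could miscompute on integer inputs;
    # the dtype-aware minimum keeps the whole reduction in int32.
    print(torch.max(x))                # tensor(-2, ..., dtype=torch.int32)
    print(torch.max(x, dim=1).values)  # tensor([-2, -4], ..., dtype=torch.int32)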
