Merged
Changes from all commits — 42 commits, all titled "cleanup", by mayank31398 on Oct 21, 2025:
2dde384, ed43ec2, fb3fb84, 7bcb57a, 722f554, 54730bb, 9f3c8fd, 316d290, 2de9a80, 68e2774, 3b01156, 69cfbcf, 47147b0, 5004ce8, 28038a2, 8f94ef6, c1fb0d0, 34ff646, 0c6eec3, 3420f3b, 9f33a05, 8a5f372, 4a59708, 6c7edac, 18a2390, ec3c61a, 90051d3, f08bacc, 47f6fb3, bd9c8e4, 1ea250e, 5f79f22, b06b247, a468f16, e930594, 59b5c27, 51fc1df, 7cfa190, e505a16, 1d13d77, 8c5eb62, 792e524
2 changes: 1 addition & 1 deletion fma/__init__.py
@@ -10,7 +10,7 @@
     get_cartesian_product_cutotune_configs,
     get_cutotune_cache,
 )
-from .enums import KernelBackend
+from .enums import KernelBackend, force_kernel_backend
 from .functional import (
     bmm,
     continuous_count,
42 changes: 36 additions & 6 deletions fma/enums.py
@@ -2,20 +2,50 @@
 # Copyright (c) 2025, Mayank Mishra
 # **************************************************
 
+from __future__ import annotations
+
+from contextlib import contextmanager
 from enum import Enum
 
-from .cutotune import CutoTuneParameter
+import torch
+
+
+_IS_ROCM_AVAILABLE = torch.version.hip is not None
+_FORCED_KERNEL_BACKEND = None
+
+
+@contextmanager
+def force_kernel_backend(kernel_backend: KernelBackend):
+    global _FORCED_KERNEL_BACKEND
+
+    original_value = _FORCED_KERNEL_BACKEND
+    _FORCED_KERNEL_BACKEND = kernel_backend
+
+    yield
+
+    _FORCED_KERNEL_BACKEND = original_value
 
 
 class KernelBackend(Enum):
     cuda = "cuda"
-    torch = "torch"
+    rocm = "rocm"
+    tpu = "pallas"
+    # for triton compatible accelerators
     triton = "triton"
+    torch = "torch"
 
+    @staticmethod
+    def get_kernel_backend_from_device(x: torch.Tensor) -> KernelBackend:
+        global _FORCED_KERNEL_BACKEND
 
-def is_cuda_kernel_backend_allowed(kernel_backend: KernelBackend) -> bool:
-    return isinstance(kernel_backend, CutoTuneParameter) or kernel_backend in [None, KernelBackend.cuda]
+        if _FORCED_KERNEL_BACKEND is not None:
+            return _FORCED_KERNEL_BACKEND
+
+        device_type = x.device.type
 
-def is_triton_kernel_backend_allowed(kernel_backend: KernelBackend) -> bool:
-    return isinstance(kernel_backend, CutoTuneParameter) or kernel_backend in [None, KernelBackend.triton]
+        if device_type == "cuda":
+            return KernelBackend.rocm if _IS_ROCM_AVAILABLE else KernelBackend.cuda
+        elif device_type == "xla":
+            return KernelBackend.tpu
+        else:
+            return KernelBackend.triton
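Taken together, force_kernel_backend and KernelBackend.get_kernel_backend_from_device replace the per-call kernel_backend arguments that the rest of this PR removes. A minimal usage sketch, assuming the package is importable as fma (per the fma/__init__.py change above) and that a CUDA device is available:

import torch

from fma import KernelBackend, force_kernel_backend

x = torch.randn(4, 8, device="cuda")

# the backend is resolved from the tensor's device:
# "cuda" -> KernelBackend.cuda (or KernelBackend.rocm on ROCm builds),
# "xla"  -> KernelBackend.tpu, anything else -> KernelBackend.triton
backend = KernelBackend.get_kernel_backend_from_device(x)

# temporarily override the device-based resolution, e.g. to fall back to
# the reference torch implementations
with force_kernel_backend(KernelBackend.torch):
    assert KernelBackend.get_kernel_backend_from_device(x) == KernelBackend.torch

# the previous behaviour is restored once the context manager exits
assert KernelBackend.get_kernel_backend_from_device(x) == backend

Since the override lives in a module-level global, it applies to every call made while the context manager is active, not just to calls involving x.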
9 changes: 3 additions & 6 deletions fma/functional/bmm/__init__.py
@@ -4,7 +4,6 @@
 
 import torch
 
-from ...cutotune import CutoTuneParameter
 from ...enums import KernelBackend
 from .triton_implementation import bmm_triton
 
@@ -17,8 +16,6 @@ def bmm(
     is_B_transposed: bool = False,
     alpha: float = 1,
     beta: float = 1,
-    *,
-    kernel_backend: KernelBackend | CutoTuneParameter = KernelBackend.triton,
 ) -> torch.Tensor:
     """computes `alpha` * (`A` @ `B`) + `beta` * `C`
 
@@ -30,8 +27,6 @@
         is_B_transposed (bool, optional): whether B has shape N x K. Defaults to False.
         alpha (float, optional): alpha. Defaults to 1.
         beta (float, optional): beta. Defaults to 1.
-        kernel_backend (KernelBackend | CutoTuneParameter, optional): kernel backend to prioritize.
-            Defaults to KernelBackend.triton.
 
     Raises:
         ValueError: if unexpected `kernel_backend` is passed
@@ -56,6 +51,8 @@
         assert C is not None
         assert C.size() == (L, M, N)
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(A)
+
     if kernel_backend == KernelBackend.torch:
         if is_A_transposed:
             A = A.transpose(1, 2)
@@ -69,7 +66,7 @@
             D = alpha * D
         else:
             D = torch.baddbmm(C, A, B, alpha=alpha, beta=beta)
-    elif kernel_backend == KernelBackend.triton:
+    elif kernel_backend in [KernelBackend.cuda, KernelBackend.triton]:
         D = torch.empty(L, M, N, dtype=A.dtype, device=A.device)
 
         bmm_triton(
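For callers, the visible change is that bmm no longer accepts kernel_backend; the backend is inferred from A's device. A hedged migration sketch (the positional order A, B, C is assumed from the docstring; shapes are illustrative only):

import torch

from fma import bmm

L, M, K, N = 2, 16, 32, 8
A = torch.randn(L, M, K, device="cuda")
B = torch.randn(L, K, N, device="cuda")
C = torch.randn(L, M, N, device="cuda")

# before this PR:
# D = bmm(A, B, C, alpha=2.0, beta=1.0, kernel_backend=KernelBackend.triton)

# after this PR: the CUDA/Triton kernel path is selected because A lives on a
# CUDA device; computes 2.0 * (A @ B) + 1.0 * C
D = bmm(A, B, C, alpha=2.0, beta=1.0)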
10 changes: 4 additions & 6 deletions fma/functional/continuous_count/__init__.py
@@ -9,18 +9,14 @@
 
 
 @torch.no_grad()
-def continuous_count(
-    x: torch.Tensor, size: int, *, kernel_backend: KernelBackend = KernelBackend.cuda
-) -> torch.Tensor:
+def continuous_count(x: torch.Tensor, size: int) -> torch.Tensor:
     """counts the number of occurances of the values [0, 1, ..., `size`) in the input tensor (`size` is excluded).
     NOTE: the user is responsible for ensuring that the values lie in the valid range, any values outside this
     range are ignored and not counted.
 
     Args:
         x (torch.Tensor): input tensor
         size (int): values [0, 1, ..., `size`) are counted (`size` is excluded)
-        kernel_backend (KernelBackend, optional): kernel backend to prioritize.
-            Defaults to KernelBackend.cuda.
 
     Returns:
         torch.Tensor: output tensor
@@ -32,9 +28,11 @@ def continuous_count(x: torch.Tensor, size: int) -> torch.Tensor:
     assert x.dim() == 1, "x should be 1-dimensional"
     assert x.dtype in [torch.int32, torch.long]
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(x)
+
     if kernel_backend == KernelBackend.torch:
         output = x.bincount(minlength=size).to(torch.uint32)
-    elif kernel_backend == KernelBackend.cuda:
+    elif kernel_backend in [KernelBackend.cuda, KernelBackend.triton]:
         output = torch.empty(size, dtype=torch.uint32, device=x.device)
         continuous_count_cuda(x=x, output=output, E=size, THREAD_BLOCK_CLUSTER_SIZE=1, BLOCK_SIZE=1024)
     else:
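A short usage sketch for the updated continuous_count (exported from the package root per fma/__init__.py); the backend now follows x.device, so this assumes a CUDA tensor:

import torch

from fma import continuous_count

# count occurrences of the values 0..3; anything outside [0, 4) is ignored
x = torch.tensor([0, 1, 1, 3, 3, 3, 7], dtype=torch.int32, device="cuda")
counts = continuous_count(x, size=4)
# counts is a uint32 tensor: tensor([1, 2, 0, 3]) for this input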
12 changes: 4 additions & 8 deletions fma/functional/cross_entropy/__init__.py
@@ -35,12 +35,7 @@ def backward(ctx, output_grad: torch.Tensor) -> tuple[torch.Tensor | None]:
 
 
 def cross_entropy(
-    x: torch.Tensor,
-    labels: torch.Tensor,
-    reduction: str = "mean",
-    logits_multiplier: float | None = None,
-    *,
-    kernel_backend: KernelBackend = KernelBackend.triton,
+    x: torch.Tensor, labels: torch.Tensor, reduction: str = "mean", logits_multiplier: float | None = None
 ) -> torch.Tensor:
     """compute cross entropy loss
 
@@ -50,8 +45,6 @@ def cross_entropy(
         reduction (str, optional): reduction should be either sum or mean. Defaults to "mean".
         logits_multiplier (float | None, optional): logits multiplier pre-multiplies logits, None implies 1.
             Defaults to None.
-        kernel_backend (KernelBackend, optional): kernel backend to prioritize.
-            Defaults to KernelBackend.triton.
 
     Returns:
         torch.Tensor: loss
@@ -64,6 +57,8 @@
         labels.size(0) == get_num_elements_and_hidden_size(x)[0]
     ), "x and labels have different number of elements along batch dimension"
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(x)
+
     if kernel_backend == KernelBackend.torch:
         x = x.float()
 
@@ -72,6 +67,7 @@
 
         x = F.cross_entropy(x, labels, reduction=reduction)
     else:
+        assert kernel_backend in [KernelBackend.cuda, KernelBackend.triton]
        x = _CrossEntropy.apply(x, labels, reduction, logits_multiplier)
 
     return x
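Because the backend override is now global, the reference torch path can still be exercised even though the kernel_backend argument is gone. A sketch, assuming cross_entropy is re-exported at the package root like bmm and continuous_count (adjust the import if it is not):

import torch

from fma import KernelBackend, cross_entropy, force_kernel_backend

logits = torch.randn(32, 1000, device="cuda")
labels = torch.randint(0, 1000, (32,), device="cuda")

# default: the CUDA/Triton kernel is picked from logits.device
loss = cross_entropy(logits, labels, reduction="mean")

# reference implementation (F.cross_entropy) via the global override
with force_kernel_backend(KernelBackend.torch):
    loss_ref = cross_entropy(logits, labels, reduction="mean")

torch.testing.assert_close(loss, loss_ref, rtol=1e-4, atol=1e-4)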
12 changes: 4 additions & 8 deletions fma/functional/fused_linear_cross_entropy.py
@@ -83,7 +83,6 @@ def fused_linear_cross_entropy(
     labels: torch.Tensor,
     reduction: str = "mean",
     logits_multiplier: float | None = None,
-    kernel_backend: KernelBackend | CutoTuneParameter = KernelBackend.triton,
 ) -> torch.Tensor:
     """compute cross entropy loss without materializing the full output logits matrix
 
@@ -105,16 +104,13 @@
     assert x.size(0) == labels.size(0), "x and labels have different number of elements along dim 0"
     assert x.size(-1) == weight.size(-1)
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(x)
+
     if kernel_backend == KernelBackend.torch:
         x = F.linear(x, weight)
-        x = cross_entropy(
-            x=x,
-            labels=labels,
-            reduction=reduction,
-            logits_multiplier=logits_multiplier,
-            kernel_backend=kernel_backend,
-        )
+        x = cross_entropy(x=x, labels=labels, reduction=reduction, logits_multiplier=logits_multiplier)
     else:
+        assert kernel_backend in [KernelBackend.cuda, KernelBackend.triton]
         x = _FusedLinearCrossEntropy.apply(x, weight, labels, reduction, logits_multiplier)
 
     return x
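The point of this function is to avoid materializing the full tokens x vocab logits matrix on the kernel path. A hedged call sketch; the leading x and weight parameters are assumed from the function body (they are not shown in this hunk), and the import path may need adjusting:

import torch

from fma import fused_linear_cross_entropy  # assumed re-export from the package root

num_tokens, hidden_size, vocab_size = 4096, 2048, 32000
x = torch.randn(num_tokens, hidden_size, device="cuda")       # hidden states
weight = torch.randn(vocab_size, hidden_size, device="cuda")  # LM head weight
labels = torch.randint(0, vocab_size, (num_tokens,), device="cuda")

# equivalent to cross_entropy(F.linear(x, weight), labels) on the torch path,
# but without materializing the (num_tokens, vocab_size) logits on the kernel path
loss = fused_linear_cross_entropy(x, weight, labels, reduction="mean")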
5 changes: 3 additions & 2 deletions fma/functional/fused_residual_add_rmsnorm/__init__.py
@@ -109,7 +109,6 @@ def fused_residual_add_rmsnorm(
     multiplier: float | None = None,
     memory_efficient: bool = False,
     deterministic: bool = False,
-    kernel_backend: KernelBackend | CutoTuneParameter = KernelBackend.triton,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """fused residual add RMSNorm computation
 
@@ -132,6 +131,8 @@
     assert weight.size(-1) == x.size(-1), "hidden size for x and weight tensor is different"
     assert weight.type() == x.type(), "tensors weight and y should have same dtype"
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(x)
+
     if kernel_backend == KernelBackend.torch:
         if multiplier not in [None, 1]:
             x = x * multiplier
@@ -142,7 +143,7 @@
 
         x = F.rms_norm(x, normalized_shape=(x.size(-1),), weight=weight, eps=eps)
     else:
-        assert kernel_backend == KernelBackend.triton
+        assert kernel_backend in [KernelBackend.cuda, KernelBackend.triton]
         increment_counter(fused_residual_add_rmsnorm)
 
         is_flat = x.dim() == 1
4 changes: 4 additions & 0 deletions fma/functional/grouped_gemm/__init__.py
@@ -4,6 +4,7 @@
 
 import torch
 
+from ...enums import KernelBackend
 from .cuda_implementation import grouped_gemm_cuda
 
 
@@ -24,6 +25,9 @@ def grouped_gemm(
     assert beta == 0
     assert C is None
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(A)
+    assert kernel_backend == KernelBackend.cuda
+
     output = torch.empty(*output_shape, device=A.device, dtype=A.dtype)
 
     grouped_gemm_cuda(
7 changes: 4 additions & 3 deletions fma/functional/gru/__init__.py
@@ -4,7 +4,6 @@
 
 import torch
 
-from ...cutotune import CutoTuneParameter
 from ...enums import KernelBackend
 from ...torch_math import clip_gradients, sigmoid, tanh
 from ...utils import empty_like_contiguous, zeros_like_contiguous
@@ -138,8 +137,6 @@ def gru(
     gradient_clipping: float | None = None,
     cu_seqlens: torch.Tensor | None = None,
     max_seqlen: torch.Tensor | int | None = None,
-    *,
-    kernel_backend: KernelBackend | CutoTuneParameter = KernelBackend.triton,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """computes multihead RNN: tanh(`input_state` @ `weight` + `input`)
 
@@ -164,6 +161,8 @@
     N, H = input.size()[-2:]
     assert weight.size() == (N, H, H)
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(input)
+
     if gradient_clipping is not None and gradient_clipping < 0:
         gradient_clipping = -gradient_clipping
 
@@ -252,6 +251,8 @@
             output[offset_unfinished] = new_state
             input_state[unfinished] = new_state
     else:
+        assert kernel_backend in [KernelBackend.cuda, KernelBackend.triton]
+
         output = _GRU.apply(
             input,
             weight,
7 changes: 0 additions & 7 deletions fma/functional/rmsnorm.py
@@ -4,8 +4,6 @@
 
 import torch
 
-from ..cutotune import CutoTuneParameter
-from ..enums import KernelBackend
 from .fused_residual_add_rmsnorm import fused_residual_add_rmsnorm
 
 
@@ -15,8 +13,6 @@ def rmsnorm(
     eps: float | None,
     memory_efficient: bool = False,
     deterministic: bool = False,
-    *,
-    kernel_backend: KernelBackend | CutoTuneParameter = KernelBackend.triton,
 ) -> torch.Tensor:
     """RMSNorm computation
 
@@ -27,8 +23,6 @@
         memory_efficient (bool, optional): memory efficient = False caches RMSNorm's denominator in the forward.
            Defaults to False.
         deterministic (bool, optional): whether to use deterministic backward. Defaults to False.
-        kernel_backend (KernelBackend | CutoTuneParameter, optional): kernel backend to prioritize.
-            Defaults to KernelBackend.triton.
 
     Returns:
         torch.Tensor: output tensor
@@ -42,7 +36,6 @@
         multiplier=None,
         memory_efficient=memory_efficient,
         deterministic=deterministic,
-        kernel_backend=kernel_backend,
     )
 
     return x
6 changes: 3 additions & 3 deletions fma/functional/rnn/__init__.py
@@ -78,8 +78,6 @@ def rnn(
     gradient_clipping: float | None = None,
     cu_seqlens: torch.Tensor | None = None,
     max_seqlen: torch.Tensor | int | None = None,
-    *,
-    kernel_backend: KernelBackend = KernelBackend.triton,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """computes multihead RNN recurrent update over the sequence length: tanh(`input_state` @ `weight` + `input`)
 
@@ -93,7 +91,6 @@
             implies no clipping. Defaults to None.
         cu_seqlens (torch.Tensor | None, optional): cumulative sequence length (must contain 0 as first element). Defaults to None.
         max_seqlen (torch.Tensor | int | None, optional): max sequence length in the batch. Defaults to None.
-        kernel_backend (KernelBackend, optional): kernel backend to prioritize. Defaults to KernelBackend.triton.
 
     Returns:
         tuple[torch.Tensor, torch.Tensor]: output tensor of shape (B, S, N, H) and output state tensor of shape (B, N, H)
@@ -105,6 +102,8 @@
     N, H = input.size()[-2:]
     assert weight.size() == (N, H, H)
 
+    kernel_backend = KernelBackend.get_kernel_backend_from_device(input)
+
     if gradient_clipping is not None and gradient_clipping < 0:
         gradient_clipping = -gradient_clipping
 
@@ -171,6 +170,7 @@
             output[offset_unfinished] = new_state
             input_state[unfinished] = new_state
     else:
+        assert kernel_backend in [KernelBackend.cuda, KernelBackend.triton]
         output = _RNN.apply(input, weight, input_state, gradient_clipping, cu_seqlens, max_seqlen)
 
     output_state = output[:, -1] if cu_seqlens is None else output[cu_seqlens[1:] - 1]
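A hedged call sketch for the updated rnn, including the packed variable-length path via cu_seqlens/max_seqlen described in the docstring. The initial-state parameter is omitted and assumed to default to None (its position is not shown in this hunk), and the import path may need adjusting:

import torch

from fma import rnn  # assumed re-export from the package root

B, S, N, H = 4, 128, 8, 64  # batch, sequence length, heads, head size

# fixed-length batch: the backend is inferred from input.device
input = torch.randn(B, S, N, H, device="cuda")
weight = torch.randn(N, H, H, device="cuda")
output, output_state = rnn(input=input, weight=weight)

# packed variable-length batch: sequences of length 100, 80 and 128 packed along
# the first dimension, with cumulative lengths starting at 0 as the docstring requires
seqlens = torch.tensor([100, 80, 128], device="cuda")
cu_seqlens = torch.nn.functional.pad(seqlens.cumsum(0), (1, 0)).to(torch.int32)
packed = torch.randn(int(seqlens.sum()), N, H, device="cuda")
output, output_state = rnn(
    input=packed, weight=weight, cu_seqlens=cu_seqlens, max_seqlen=int(seqlens.max())
)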