22 changes: 16 additions & 6 deletions src/liger_kernel/ops/layer_norm.py
@@ -197,11 +197,21 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
     _DW = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)
     _DB = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)

-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    if n_cols > BLOCK_SIZE:
-        raise RuntimeError(
-            f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
-        )
+    if X.device.type == "xpu":  # XPU-specific optimization
+        BLOCK_SIZE = torch.xpu.get_device_properties(X.device).max_work_group_size
Collaborator:
afaik we want the block size here to be triton.next_power_of_2(n). torch.xpu.get_device_properties(X.device).max_work_group_size supposedly gives the maximum possible block size on the XPU device. A better way to handle this is to change the calculate_settings function directly and set MAX_FUSED_SIZE according to the device.
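
For illustration only, a device-aware calculate_settings along the lines of this suggestion might look like the sketch below. The default cap of 65536 and the num_warps heuristic are assumptions about the existing helper, not a copy of it.

```python
# Illustrative sketch of the suggestion above, not the actual liger_kernel code.
# Idea: pick the block-size cap per device inside calculate_settings instead of
# special-casing XPU inside every kernel.
import torch
import triton


def calculate_settings(n, device=None):
    # Assumed default cap for CUDA-like devices.
    max_fused_size = 65536
    if device is not None and torch.device(device).type == "xpu":
        # On XPU, cap the block size at the device's maximum work-group size.
        max_fused_size = torch.xpu.get_device_properties(device).max_work_group_size

    block_size = triton.next_power_of_2(n)
    if block_size > max_fused_size:
        raise RuntimeError(
            f"Feature dimension {n} exceeds the maximum fused size "
            f"{max_fused_size} supported on this device."
        )

    # Assumed warp heuristic; the real helper may scale num_warps differently.
    num_warps = 4
    if block_size >= 32768:
        num_warps = 32
    elif block_size >= 8192:
        num_warps = 16
    elif block_size >= 2048:
        num_warps = 8
    return block_size, num_warps
```

Callers would then pass X.device and keep a single code path for CUDA and XPU.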

Contributor (author):
@shivam15s These changes are specific to the layernorm and rmsnorm, which is why I made them locally in those kernels; changing calculate_settings directly would impact the other kernels.

Collaborator:
@Tarakarevu1 How much is the perf improvement of layernorm and rmsnorm after this change?
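
For reference, a minimal timing harness like the sketch below could be used to compare the two commits on the same shapes. The wrapped call and shapes are placeholders; only time.perf_counter and the torch.xpu / torch.cuda synchronize calls are assumed.

```python
# Minimal benchmarking sketch (not part of this PR). Wrap whichever
# layernorm/rmsnorm call is under test in `fn` and time it on both commits.
import time

import torch


def benchmark_ms(fn, warmup=10, iters=100, device_type="xpu"):
    sync = torch.xpu.synchronize if device_type == "xpu" else torch.cuda.synchronize
    for _ in range(warmup):  # warm up kernel compilation and caches
        fn()
    sync()
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    sync()
    return (time.perf_counter() - start) / iters * 1e3  # milliseconds per call


# Hypothetical usage (module name and shapes are placeholders):
# x = torch.randn(4096, 8192, device="xpu", dtype=torch.bfloat16)
# print(benchmark_ms(lambda: layer_norm_module(x)))
```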

+    else:
+        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+    if X.device.type == "xpu":  # XPU-specific optimization
Contributor:
@Tarakarevu1 I think this is not an optimization but rather a guardrail to avoid spilling beyond 64 KB?
LGTM on the code changes, although the RuntimeError could be tweaked so the message reflects the XPU-specific limit?
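
One possible tweak along those lines (purely illustrative, not what the PR currently does) would be to name the device and the hard limit in the message:

```python
# Illustrative only: make the XPU guardrail explicit in the error message.
if X.device.type == "xpu" and n_cols > 65536:
    raise RuntimeError(
        f"LayerNorm backward on XPU supports a feature dimension of at most "
        f"65536; got n_cols={n_cols}. Consider reducing the hidden size."
    )
```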

+        if n_cols > 65536:
+            raise RuntimeError(
+                f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
+            )
+    else:
+        if n_cols > BLOCK_SIZE:
+            raise RuntimeError(
+                f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
+            )

     rows_per_program = math.ceil(n_rows / sm_count)
     grid = (sm_count,)
@@ -218,7 +228,7 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
     # XPU-specific optimization
     kernel_args = {}
     if X.device.type == "xpu":
-        kernel_args.update({"grf_mode": "large", "num_warps": 32, "num_stages": 4})
+        kernel_args.update({"grf_mode": "large", "num_warps": 4, "num_stages": 4})

     _layer_norm_backward_kernel[grid](
         X,
16 changes: 13 additions & 3 deletions src/liger_kernel/ops/rms_norm.py
@@ -212,7 +212,11 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
     dim = shape[-1]
     X = X.view(-1, dim)
     n_rows, n_cols = X.shape
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+    if X.device.type == "xpu":  # XPU-specific optimization
+        BLOCK_SIZE = torch.xpu.get_device_properties(X.device).max_work_group_size
+        num_warps = 4
+    else:
+        BLOCK_SIZE, num_warps = calculate_settings(n_cols)

     Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
     # RSTD is to cache rstd for each row

@@ -262,8 +266,14 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
     # fp32 for numerical stability especially.
     _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

-    if n_cols > BLOCK_SIZE:
-        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+    if X.device.type == "xpu":  # XPU-specific optimization
+        if n_cols > 65536:
+            raise RuntimeError(
+                "This layer norm doesn't support feature dim >= 64KB."
+            )  # TODO RuntimeError might need little more investigation in the future
+    else:
+        if n_cols > BLOCK_SIZE:
+            raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
     rows_per_program = math.ceil(n_rows / sm_count)
     grid = (sm_count,)
