
Commit d9f60ff

Support noarch build variants (#190)
* Fix issues handling CPU layers

* Support noarch build variants

  This change adds support for noarch build variants. So far we have used
  the universal variant for kernels that do not have any AoT-compiled code.
  However, the universal variant has two important issues:

  1. A kernel without AoT-compiled code might still be backend-specific.
     E.g. NVIDIA CuTe-based kernels are not universal in the sense that
     they don't work on non-NVIDIA GPUs.
  2. We cannot specify dependencies per backend.

  To solve these issues, we introduce noarch variants to replace universal
  kernels. Noarch kernels have variants of the shape `torch-<backend>`
  (e.g. `torch-xpu`), which resolves the issues outlined above. This change
  introduces support for loading noarch kernels. In the future, we will
  start emitting deprecation warnings for universal kernels (to eventually
  remove support).

* Fix build variant regex

* Remove outdated comment
1 parent 8b807fa commit d9f60ff
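
To make the naming scheme concrete, a kernel repository can now ship any mix of the three variant kinds under `build/`. A minimal sketch of such a layout (the AoT variant name is hypothetical; it depends on the local torch build):

    build/torch28-cxx11-cu128-x86_64-linux/   # AoT-compiled, fully specific
    build/torch-cuda/                         # noarch, but backend-specific
    build/torch-universal/                    # legacy universal fallback

Loading prefers the most specific variant present, in the order shown. The noarch names the loader can detect are torch-cuda, torch-rocm, torch-metal, torch-xpu, torch-npu, and torch-cpu (see src/kernels/utils.py below).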

File tree

7 files changed: +82 −44 lines changed

src/kernels/cli.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 from .doc import generate_readme_for_kernel
 from .wheel import build_variant_to_wheel

-BUILD_VARIANT_REGEX = re.compile(r"^(torch\d+\d+|torch-universal)")
+BUILD_VARIANT_REGEX = re.compile(r"^(torch\d+\d+|torch-)")


 def main():
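
As a quick, illustrative check of the broadened pattern (the variant names here are examples only), the regex now accepts any `torch-` prefixed variant rather than just `torch-universal`:

    import re

    BUILD_VARIANT_REGEX = re.compile(r"^(torch\d+\d+|torch-)")

    # Matches AoT variants, the legacy universal variant, and the new noarch variants:
    for name in ["torch27-cxx11-cu126-x86_64-linux", "torch-universal", "torch-cuda", "torch-xpu"]:
        assert BUILD_VARIANT_REGEX.match(name)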

src/kernels/layer/kernelize.py

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

 def _validate_device_type(device_type: str) -> None:
     """Validate that the device type is supported."""
-    supported_devices = {"cuda", "mps", "npu", "rocm", "xpu"}
+    supported_devices = {"cpu", "cuda", "mps", "npu", "rocm", "xpu"}
     if device_type not in supported_devices:
         raise ValueError(
             f"Unsupported device type '{device_type}'. Supported device types are: {', '.join(sorted(supported_devices))}"

src/kernels/layer/repos.py

Lines changed: 23 additions & 1 deletion
@@ -24,7 +24,9 @@ class DeviceRepos(ABC):
     @staticmethod
     def create_repo(device: Device) -> "DeviceRepos":
         """Create an appropriate repository set for this device type."""
-        if device.type == "cuda":
+        if device.type == "cpu":
+            return _CPURepos()
+        elif device.type == "cuda":
             return _CUDARepos()
         elif device.type == "rocm":
             return _ROCMRepos()
@@ -51,6 +53,26 @@ def insert(self, device: Device, repos: Dict[Mode, RepositoryProtocol]):
         ...


+class _CPURepos(DeviceRepos):
+    _repos: Dict[Mode, RepositoryProtocol]
+
+    def __init__(self):
+        super().__init__()
+        self._repos = {}
+
+    @property
+    def repos(
+        self,
+    ) -> Optional[Dict[Mode, RepositoryProtocol]]:
+        return self._repos
+
+    def insert(self, device: Device, repos: Dict[Mode, RepositoryProtocol]):
+        if device.type != "cpu":
+            raise ValueError(f"Device type must be 'cpu', got {device.type}")
+
+        self._repos = repos
+
+
 class _XPURepos(DeviceRepos):
     _repos: Dict[Mode, RepositoryProtocol]
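
A minimal sketch of the new dispatch path, assuming the `Device` descriptor used elsewhere in this module (illustrative only, not part of the diff):

    # Hypothetical usage; Device comes from kernels.layer.
    repos = DeviceRepos.create_repo(Device(type="cpu"))
    assert isinstance(repos, _CPURepos)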

src/kernels/utils.py

Lines changed: 43 additions & 30 deletions
@@ -84,11 +84,33 @@ def build_variant() -> str:
     return f"torch{torch_version.major}{torch_version.minor}-{cxxabi}-{compute_framework}-{cpu}-{os}"


-def universal_build_variant() -> str:
+def build_variant_noarch() -> str:
+    import torch
+
+    if torch.version.cuda is not None:
+        return "torch-cuda"
+    elif torch.version.hip is not None:
+        return "torch-rocm"
+    elif torch.backends.mps.is_available():
+        return "torch-metal"
+    elif hasattr(torch.version, "xpu") and torch.version.xpu is not None:
+        return "torch-xpu"
+    elif _get_privateuse_backend_name() == "npu":
+        return "torch-npu"
+    else:
+        return "torch-cpu"
+
+
+def build_variant_universal() -> str:
     # Once we support other frameworks, detection goes here.
     return "torch-universal"


+def build_variants() -> List[str]:
+    """Return compatible build variants in preferred order."""
+    return [build_variant(), build_variant_noarch(), build_variant_universal()]
+
+
 def _import_from_path(module_name: str, variant_path: Path) -> ModuleType:
     metadata_path = variant_path / "metadata.json"
     if metadata_path.exists():
@@ -146,13 +168,12 @@ def install_kernel(
         `Tuple[str, Path]`: A tuple containing the package name and the path to the variant directory.
     """
     package_name = package_name_from_repo_id(repo_id)
-    variant = build_variant()
-    universal_variant = universal_build_variant()
+    allow_patterns = [f"build/{variant}/*" for variant in build_variants()]
     user_agent = _get_user_agent(user_agent=user_agent)
     repo_path = Path(
         snapshot_download(
             repo_id,
-            allow_patterns=[f"build/{variant}/*", f"build/{universal_variant}/*"],
+            allow_patterns=allow_patterns,
             cache_dir=CACHE_DIR,
             revision=revision,
             local_files_only=local_files_only,
@@ -173,23 +194,22 @@ def _find_kernel_in_repo_path(
     package_name: str,
     variant_locks: Optional[Dict[str, VariantLock]] = None,
 ) -> Tuple[str, Path]:
-    specific_variant = build_variant()
-    universal_variant = universal_build_variant()
-
-    specific_variant_path = repo_path / "build" / specific_variant
-    universal_variant_path = repo_path / "build" / universal_variant
-
-    if specific_variant_path.exists():
-        variant = specific_variant
-        variant_path = specific_variant_path
-    elif universal_variant_path.exists():
-        variant = universal_variant
-        variant_path = universal_variant_path
-    else:
+    variants = build_variants()
+    variant = None
+    variant_path = None
+    for candidate_variant in variants:
+        variant_path = repo_path / "build" / candidate_variant
+        if variant_path.exists():
+            variant = candidate_variant
+            break
+
+    if variant is None:
         raise FileNotFoundError(
-            f"Kernel at path `{repo_path}` does not have one of build variants: {specific_variant}, {universal_variant}"
+            f"Kernel at path `{repo_path}` does not have one of build variants: {', '.join(variants)}"
         )

+    assert variant_path is not None
+
     if variant_locks is not None:
         variant_lock = variant_locks.get(variant)
         if variant_lock is None:
@@ -295,13 +315,9 @@ def get_local_kernel(repo_path: Path, package_name: str) -> ModuleType:
     Returns:
         `ModuleType`: The imported kernel module.
     """
-    variant = build_variant()
-    universal_variant = universal_build_variant()
-
     # Presume we were given the top level path of the kernel repository.
     for base_path in [repo_path, repo_path / "build"]:
-        # Prefer the universal variant if it exists.
-        for v in [universal_variant, variant]:
+        for v in build_variants():
             variant_path = base_path / v
             if variant_path.exists():
                 return _import_from_path(package_name, variant_path)
@@ -337,9 +353,8 @@ def has_kernel(

     package_name = package_name_from_repo_id(repo_id)
     variant = build_variant()
-    universal_variant = universal_build_variant()

-    for variant in [universal_variant, variant]:
+    for variant in build_variants():
         for init_file in ["__init__.py", f"{package_name}/__init__.py"]:
             if file_exists(
                 repo_id,
@@ -379,13 +394,11 @@ def load_kernel(repo_id: str, *, lockfile: Optional[Path] = None) -> ModuleType:

     package_name = package_name_from_repo_id(repo_id)

-    variant = build_variant()
-    universal_variant = universal_build_variant()
-
+    allow_patterns = [f"build/{variant}/*" for variant in build_variants()]
     repo_path = Path(
         snapshot_download(
             repo_id,
-            allow_patterns=[f"build/{variant}/*", f"build/{universal_variant}/*"],
+            allow_patterns=allow_patterns,
             cache_dir=CACHE_DIR,
             revision=locked_sha,
             local_files_only=True,
@@ -399,7 +412,7 @@ def load_kernel(repo_id: str, *, lockfile: Optional[Path] = None) -> ModuleType:
         return _import_from_path(package_name, variant_path)
     except FileNotFoundError:
         raise FileNotFoundError(
-            f"Locked kernel `{repo_id}` does not have build `{variant}` or was not downloaded with `kernels download <project>`"
+            f"Locked kernel `{repo_id}` does not have an applicable variant or was not downloaded with `kernels download <project>`"
         )

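
To see how the preference order is consumed, here is a small sketch of the download filter that `install_kernel`/`load_kernel` now build (the AoT variant name is hypothetical; it depends on the local torch build):

    # Assumed build_variants() output on a CUDA build of PyTorch 2.8, x86_64 Linux:
    variants = ["torch28-cxx11-cu128-x86_64-linux", "torch-cuda", "torch-universal"]

    # snapshot_download only fetches the matching build directories:
    allow_patterns = [f"build/{v}/*" for v in variants]
    print(allow_patterns)
    # ['build/torch28-cxx11-cu128-x86_64-linux/*', 'build/torch-cuda/*', 'build/torch-universal/*']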

tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def device():
     elif _get_privateuse_backend_name() == "npu":
         return "npu"

-    pytest.skip("No CUDA, NPU or XPU")
+    return "cpu"


 def pytest_runtest_setup(item):

tests/test_basic.py

Lines changed: 7 additions & 0 deletions
@@ -163,6 +163,13 @@ def test_universal_kernel(universal_kernel):
     torch.testing.assert_close(out, out_check, rtol=1e-1, atol=1e-1)


+def test_noarch_kernel(device):
+    supported_devices = ["cpu", "cuda", "xpu"]
+    if device not in supported_devices:
+        pytest.skip(f"Device is not one of: {','.join(supported_devices)}")
+    get_kernel("kernels-test/silu-and-mul-noarch")
+
+
 @pytest.mark.parametrize(
     "repo_revision",
     [

tests/test_layer.py

Lines changed: 6 additions & 10 deletions
@@ -23,7 +23,6 @@
     _validate_layer,
 )
 from kernels.utils import (
-    _get_privateuse_backend_name,
     install_kernel,
 )

@@ -250,16 +249,13 @@ def test_hub_forward_npu():
     assert silu_and_mul_with_kernel.n_calls == 0


-@pytest.mark.skipif(
-    hasattr(torch, "xpu") and getattr(torch.xpu, "is_available", lambda: False)(),
-    reason="Skip on xpu devices",
-)
-@pytest.mark.skipif(
-    _get_privateuse_backend_name() == "npu",
-    reason="Skip on npu devices",
-)
-def test_rocm_kernel_mapping():
+def test_rocm_kernel_mapping(device):
     """Test that ROCm shorthand device mapping works correctly."""
+
+    # Lookup uses the GPU capability, so it fails for non-ROCm/CUDA.
+    if device not in ["cuda", "rocm"]:
+        pytest.skip("Test only applicable to CUDA and ROCM devices")
+
     kernel_layer_mapping = {
         "SiluAndMul": {
             "rocm": LayerRepository(
