Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Allow the configuration of plugins in the binder service. [#1480](https://github.com/kai-scheduler/KAI-Scheduler/pull/1480) - [davidLif](https://github.com/davidLif)
- Added support for configuring scheduler log level and custom scheduler args via Helm values (`scheduler.args`) [#1452](https://github.com/kai-scheduler/KAI-Scheduler/pull/1452) [dttung2905](https://github.com/dttung2905)
- Added `crdupgrader.image.registry` Helm value to override `global.registry` for the `crd-upgrader` pre-install/pre-upgrade hook image, allowing the hook image to be served from a separate mirror without redirecting all chart images. [#1404](https://github.com/kai-scheduler/KAI-Scheduler/issues/1404)
- Added an opt-in `hamicore` binder plugin (depends on `gpusharing`) to write the HAMI-core GPU memory limit (`CUDA_DEVICE_MEMORY_LIMIT`) for fractional GPU pods.

### Changed
- **Breaking:** JobSet PodGroups no longer auto-calculate `minAvailable` from `parallelism × replicas`. The default is now 1. Use the `kai.scheduler/batch-min-member` annotation to set a custom value.
Expand Down
7 changes: 6 additions & 1 deletion docs/developer/binder.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,9 @@ The current default binder plugins are:
| `volumebinding` | 300 | `bindTimeoutSeconds: "120"` | Handles Kubernetes persistent volume binding before pod bind. |
| `dynamicresources` | 200 | `bindTimeoutSeconds: "120"` | Handles Kubernetes Dynamic Resource Allocation claim binding before pod bind. |
| `gpusharing` | 100 | `cdiEnabled: "false"` | Handles fractional GPU pod mutation needed for GPU sharing. |
| `hamicore` | 50 | | Optional HAMI-core GPU virtualization for fractional GPU pods. Depends on `gpusharing`. Disabled by default. |

For operator-managed deployments, the operator sets the `gpusharing` `cdiEnabled` argument from `spec.binder.cdiEnabled`. If `spec.binder.cdiEnabled` is unset, the operator attempts to auto-detect CDI from the NVIDIA GPU Operator `ClusterPolicy`.
For operator-managed deployments, the operator sets the `gpusharing` `cdiEnabled` argument from `spec.binder.cdiEnabled`. If `spec.binder.cdiEnabled` is unset, the operator attempts to auto-detect CDI from the NVIDIA GPU Operator `ClusterPolicy`. Enable `hamicore` to opt into HAMI-core GPU memory limits.

### Config Examples

Expand Down Expand Up @@ -197,6 +198,10 @@ The dynamic resources plugin handles Kubernetes Dynamic Resource Allocation reso

The GPU sharing plugin handles fractional GPU assignments. For shared GPU allocations it creates the required GPU sharing ConfigMaps and sets the NVIDIA visible devices and GPU portion information on the target container.

#### HAMI-core Plugin

The HAMI-core plugin is disabled by default and **requires the `gpusharing` plugin to be enabled**. When enabled, it writes `CUDA_DEVICE_MEMORY_LIMIT` to the GPU sharing ConfigMap, calculated from the node's `nvidia.com/gpu.memory` label value multiplied by `BindRequest.spec.receivedGPU.portion`.

### Creating Custom Plugins

To create a custom binder plugin:
Expand Down
8 changes: 7 additions & 1 deletion pkg/apis/kai/v1/binder/binder.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const (
VolumeBindingPluginName = "volumebinding"
DynamicResourcesPluginName = "dynamicresources"
GPUSharingPluginName = "gpusharing"
HamiCorePluginName = "hamicore"

BindTimeoutSecondsArgument = "bindTimeoutSeconds"
CDIEnabledArgument = "cdiEnabled"
Expand All @@ -34,6 +35,7 @@ var defaultPluginPriorities = map[string]int{
VolumeBindingPluginName: 300,
DynamicResourcesPluginName: 200,
GPUSharingPluginName: 100,
HamiCorePluginName: 50,
}

// PluginConfig allows overriding binder plugin settings.
Expand Down Expand Up @@ -84,7 +86,7 @@ type Binder struct {

// Plugins allows overriding binder plugin configuration. Keys are plugin names.
// Built-in plugins can be disabled, reordered, or have their arguments changed.
// Built-in plugins: volumebinding, dynamicresources, gpusharing.
// Built-in plugins: volumebinding, dynamicresources, gpusharing, hamicore.
// +kubebuilder:validation:Optional
Plugins map[string]PluginConfig `json:"plugins,omitempty"`

Expand Down Expand Up @@ -196,6 +198,10 @@ func DefaultPluginsConfig(bindTimeoutSeconds int, cdiEnabled bool) map[string]Pl
CDIEnabledArgument: strconv.FormatBool(cdiEnabled),
},
},
HamiCorePluginName: {
Enabled: ptr.To(false),
Priority: ptr.To(defaultPluginPriorities[HamiCorePluginName]),
},
}
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/kai/v1/binder/binder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ var _ = Describe("Binder", func() {
binder.SetDefaultsWhereNeeded(nil, nil)
Expect(binder.Plugins[GPUSharingPluginName].Arguments[CDIEnabledArgument]).
To(Equal(strconv.FormatBool(true)))
Expect(binder.Plugins[HamiCorePluginName].Enabled).NotTo(BeNil())
Expect(*binder.Plugins[HamiCorePluginName].Enabled).To(BeFalse())
})

It("Set Defaults With Plugin Overrides", func(ctx context.Context) {
Expand Down
7 changes: 4 additions & 3 deletions pkg/binder/common/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
package common

const (
GPUPortion = "GPU_PORTION"
ReceivedTypeFraction = "Fraction"
ReceivedTypeRegular = "Regular"
GPUPortion = "GPU_PORTION"
CudaDeviceMemoryLimit = "CUDA_DEVICE_MEMORY_LIMIT"
ReceivedTypeFraction = "Fraction"
ReceivedTypeRegular = "Regular"
)
35 changes: 35 additions & 0 deletions pkg/binder/common/gpu_access.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

Expand Down Expand Up @@ -58,6 +59,19 @@ func AddGPUSharingEnvVars(container *v1.Container, sharedGpuConfigMapName string
},
},
})

AddEnvVarToContainer(container, v1.EnvVar{
Name: CudaDeviceMemoryLimit,
ValueFrom: &v1.EnvVarSource{
ConfigMapKeyRef: &v1.ConfigMapKeySelector{
Key: CudaDeviceMemoryLimit,
LocalObjectReference: v1.LocalObjectReference{
Name: sharedGpuConfigMapName,
},
Optional: ptr.To(true),
},
},
})
}

func SetNvidiaVisibleDevices(
Expand Down Expand Up @@ -122,6 +136,27 @@ func SetGPUPortion(
return nil
}

// SetCudaDeviceMemoryLimit writes the HAMI-core CUDA_DEVICE_MEMORY_LIMIT value
// into the pod's GPU sharing capabilities ConfigMap, so that the env var added
// by AddGPUSharingEnvVars resolves to the calculated limit at runtime.
func SetCudaDeviceMemoryLimit(
	ctx context.Context, kubeClient client.Client, pod *v1.Pod, containerRef *gpusharingconfigmap.PodContainerRef,
	cudaDeviceMemoryLimit string,
) error {
	updateFunc := func(data map[string]string) error {
		data[CudaDeviceMemoryLimit] = cudaDeviceMemoryLimit
		return nil
	}
	capabilitiesMapName, err := gpusharingconfigmap.ExtractCapabilitiesConfigMapName(pod, containerRef)
	if err != nil {
		return err
	}

	err = UpdateConfigMapEnvironmentVariable(ctx, kubeClient, pod, capabilitiesMapName, updateFunc)
	if err != nil {
		// Wrap with %w (not %v) so callers can still match the underlying
		// error with errors.Is/errors.As.
		return fmt.Errorf("failed to update CUDA_DEVICE_MEMORY_LIMIT value in gpu sharing configmap for pod <%s/%s>: %w",
			pod.Namespace, pod.Name, err)
	}
	return nil
}

func UpdateConfigMapEnvironmentVariable(
ctx context.Context, kubeclient client.Client, task *v1.Pod,
configMapName string, changesFunc func(map[string]string) error,
Expand Down
1 change: 1 addition & 0 deletions pkg/binder/plugins/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const (
VolumeBindingPluginName = kaiv1binder.VolumeBindingPluginName
DynamicResourcesPluginName = kaiv1binder.DynamicResourcesPluginName
GPUSharingPluginName = kaiv1binder.GPUSharingPluginName
HamiCorePluginName = kaiv1binder.HamiCorePluginName

BindTimeoutSecondsArgument = kaiv1binder.BindTimeoutSecondsArgument
CDIEnabledArgument = kaiv1binder.CDIEnabledArgument
Expand Down
6 changes: 6 additions & 0 deletions pkg/binder/plugins/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ func TestDefaultConfig(t *testing.T) {
if got := config[GPUSharingPluginName].Arguments[CDIEnabledArgument]; got != "true" {
t.Fatalf("expected gpusharing CDI true, got %q", got)
}
if *config[HamiCorePluginName].Enabled {
t.Fatalf("expected hamicore to be disabled by default")
}
if got := len(config[HamiCorePluginName].Arguments); got != 0 {
t.Fatalf("expected hamicore to have no default arguments, got %d", got)
}

options := config.EnabledOptions()
if got := []string{options[0].Name, options[1].Name, options[2].Name}; !equalStrings(got,
Expand Down
42 changes: 42 additions & 0 deletions pkg/binder/plugins/dependencies_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2026 NVIDIA CORPORATION
// SPDX-License-Identifier: Apache-2.0

package plugins

import (
"testing"

"k8s.io/utils/ptr"
)

// TestValidateDependentPluginsHamiCoreRequiresGPUSharing verifies that enabling
// hamicore while gpusharing is disabled fails dependency validation.
func TestValidateDependentPluginsHamiCoreRequiresGPUSharing(t *testing.T) {
	cfg := Config{
		GPUSharingPluginName: {Enabled: ptr.To(false)},
		HamiCorePluginName:   {Enabled: ptr.To(true)},
	}

	err := validateDependentPlugins(cfg)
	if err == nil {
		t.Fatalf("expected dependency validation error")
	}
}

// TestValidateDependentPluginsHamiCoreRequiresOrdering verifies that hamicore
// must have a lower priority than gpusharing so that gpusharing runs first.
func TestValidateDependentPluginsHamiCoreRequiresOrdering(t *testing.T) {
	cfg := Config{
		GPUSharingPluginName: {Enabled: ptr.To(true), Priority: ptr.To(50)},
		HamiCorePluginName:   {Enabled: ptr.To(true), Priority: ptr.To(100)},
	}

	err := validateDependentPlugins(cfg)
	if err == nil {
		t.Fatalf("expected ordering validation error")
	}
}
53 changes: 53 additions & 0 deletions pkg/binder/plugins/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/gpusharing"
"github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/hamicore"
k8splugins "github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/k8s-plugins"
)

Expand Down Expand Up @@ -48,9 +49,14 @@ func InitDefaultPlugins() {
RegisterPluginBuilder(VolumeBindingPluginName, newVolumeBindingPlugin)
RegisterPluginBuilder(DynamicResourcesPluginName, newDynamicResourcesPlugin)
RegisterPluginBuilder(GPUSharingPluginName, newGPUSharingPlugin)
RegisterPluginBuilder(HamiCorePluginName, newHamiCorePlugin)
}

func BuildConfiguredPlugins(buildContext PluginBuildContext, config Config) (*BinderPlugins, error) {
if err := validateDependentPlugins(config); err != nil {
return nil, err
}

binderPlugins := New()
for _, option := range config.EnabledOptions() {
builder, found := GetPluginBuilder(option.Name)
Expand Down Expand Up @@ -101,6 +107,41 @@ func newGPUSharingPlugin(buildContext PluginBuildContext, arguments map[string]s
return gpusharing.New(buildContext.KubeClient, cdiEnabled), nil
}

// newHamiCorePlugin builds the hamicore binder plugin. The plugin takes no
// configuration arguments, so the argument map is ignored.
func newHamiCorePlugin(buildContext PluginBuildContext, _ map[string]string) (Plugin, error) {
	return hamicore.New(buildContext.KubeClient), nil
}

// validateDependentPlugins enforces cross-plugin requirements before the
// binder plugin set is built. Currently it checks that the hamicore plugin,
// when enabled, has gpusharing enabled and ordered ahead of it.
func validateDependentPlugins(config Config) error {
	hamiCoreCfg, hamiCoreFound := config[HamiCorePluginName]
	if !hamiCoreFound || (hamiCoreCfg.Enabled != nil && !*hamiCoreCfg.Enabled) {
		// hamicore is absent or explicitly disabled; nothing to validate.
		return nil
	}

	gpuSharingCfg, gpuSharingFound := config[GPUSharingPluginName]
	if !gpuSharingFound || (gpuSharingCfg.Enabled != nil && !*gpuSharingCfg.Enabled) {
		return fmt.Errorf("%q plugin requires %q plugin to be enabled", HamiCorePluginName, GPUSharingPluginName)
	}

	// PreBind is invoked in EnabledOptions() order (higher priority first).
	// hamicore requires gpusharing to have already created the configmap.
	// Reuse the configs fetched above instead of re-indexing the map.
	// NOTE(review): a nil Priority falls back to 0 here; if callers may leave
	// Priority unset and rely on built-in defaults, this check could disagree
	// with the effective ordering — confirm against EnabledOptions().
	hamiCorePri := ptrDerefInt(hamiCoreCfg.Priority, 0)
	gpuSharingPri := ptrDerefInt(gpuSharingCfg.Priority, 0)
	if gpuSharingPri <= hamiCorePri {
		return fmt.Errorf("%q plugin requires %q to run before it (expected %q.priority > %q.priority, got %d <= %d)",
			HamiCorePluginName, GPUSharingPluginName, GPUSharingPluginName, HamiCorePluginName,
			gpuSharingPri, hamiCorePri)
	}

	return nil
}

// ptrDerefInt returns *v when v is non-nil, and defaultValue otherwise.
func ptrDerefInt(v *int, defaultValue int) int {
	result := defaultValue
	if v != nil {
		result = *v
	}
	return result
}

func int64Argument(arguments map[string]string, name string) (int64, error) {
value, found := arguments[name]
if !found {
Expand All @@ -124,3 +165,15 @@ func boolArgument(arguments map[string]string, name string) (bool, error) {
}
return parsed, nil
}

// boolArgumentOrDefault parses the named argument as a boolean, returning
// defaultValue when the argument is absent and an error when it is present
// but not a valid boolean.
func boolArgumentOrDefault(arguments map[string]string, name string, defaultValue bool) (bool, error) {
	raw, ok := arguments[name]
	if !ok {
		return defaultValue, nil
	}
	result, parseErr := strconv.ParseBool(raw)
	if parseErr != nil {
		return false, fmt.Errorf("invalid argument %q=%q: %w", name, raw, parseErr)
	}
	return result, nil
}
89 changes: 89 additions & 0 deletions pkg/binder/plugins/hamicore/hami_core.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright 2026 NVIDIA CORPORATION
// SPDX-License-Identifier: Apache-2.0

package hamicore

import (
"context"
"fmt"
"strconv"

v1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/kai-scheduler/KAI-scheduler/pkg/apis/scheduling/v1alpha2"
"github.com/kai-scheduler/KAI-scheduler/pkg/binder/common"
"github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/state"
"github.com/kai-scheduler/KAI-scheduler/pkg/common/constants"
)

// Plugin is the hamicore binder plugin. It writes the HAMI-core
// CUDA_DEVICE_MEMORY_LIMIT value for fractional GPU pods during PreBind.
type Plugin struct {
	kubeClient client.Client
}

// New returns a hamicore Plugin that uses kubeClient to update the GPU
// sharing ConfigMap.
func New(kubeClient client.Client) *Plugin {
	return &Plugin{kubeClient: kubeClient}
}

// Name returns the plugin's registration name.
func (p *Plugin) Name() string {
	return "hamicore"
}

// PreBind writes the HAMI-core CUDA device memory limit into the pod's GPU
// sharing ConfigMap. It is a no-op for non-shared (whole-GPU) allocations.
func (p *Plugin) PreBind(
	ctx context.Context, pod *v1.Pod, node *v1.Node, bindRequest *v1alpha2.BindRequest, _ *state.BindingState,
) error {
	if !common.IsSharedGPUAllocation(bindRequest) {
		return nil
	}

	cudaDeviceMemoryLimit, err := calculateCudaDeviceMemoryLimit(node, bindRequest)
	if err != nil {
		// NOTE(review): the calculation error is swallowed here and binding
		// proceeds WITHOUT a memory limit (e.g. node missing the GPU memory
		// label). If this is deliberate best-effort behavior, consider logging
		// the reason; if not, this should return the error — confirm intent.
		return nil
	}

	containerRef, err := common.GetFractionContainerRef(pod)
	if err != nil {
		return fmt.Errorf("failed to get fraction container ref: %w", err)
	}

	return common.SetCudaDeviceMemoryLimit(ctx, p.kubeClient, pod, containerRef, cudaDeviceMemoryLimit)
}

// calculateCudaDeviceMemoryLimit derives the HAMI-core memory limit (MiB, as
// a decimal string) from the node's GPU memory label scaled by the received
// GPU portion of the bind request.
func calculateCudaDeviceMemoryLimit(node *v1.Node, bindRequest *v1alpha2.BindRequest) (string, error) {
	if node == nil || bindRequest == nil || bindRequest.Spec.ReceivedGPU == nil {
		return "", fmt.Errorf("missing data for CUDA_DEVICE_MEMORY_LIMIT calculation")
	}

	memoryLabel, ok := node.Labels[constants.NvidiaGpuMemory]
	if !ok {
		return "", fmt.Errorf("node does not include %s label", constants.NvidiaGpuMemory)
	}

	totalMib, parseErr := strconv.ParseInt(memoryLabel, 10, 64)
	if parseErr != nil || totalMib <= 0 {
		return "", fmt.Errorf("invalid %s label value %q", constants.NvidiaGpuMemory, memoryLabel)
	}

	portion, parseErr := strconv.ParseFloat(bindRequest.Spec.ReceivedGPU.Portion, 64)
	if parseErr != nil || portion <= 0 {
		return "", fmt.Errorf("invalid received gpu portion %q", bindRequest.Spec.ReceivedGPU.Portion)
	}

	// Truncate toward zero; a portion small enough to round to 0 MiB is rejected.
	limitMib := int64(float64(totalMib) * portion)
	if limitMib <= 0 {
		return "", fmt.Errorf("calculated allocated gpu memory is zero")
	}

	return strconv.FormatInt(limitMib, 10), nil
}

// PostBind is a no-op for the hamicore plugin.
func (p *Plugin) PostBind(
	context.Context, *v1.Pod, *v1.Node, *v1alpha2.BindRequest, *state.BindingState,
) {
}

// Rollback is a no-op; PreBind only mutates the GPU sharing ConfigMap,
// whose lifecycle is presumably handled by the gpusharing plugin — confirm.
func (p *Plugin) Rollback(
	context.Context, *v1.Pod, *v1.Node, *v1alpha2.BindRequest, *state.BindingState,
) error {
	return nil
}
3 changes: 3 additions & 0 deletions pkg/operator/operands/binder/binder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,15 @@ var _ = Describe("Binder", func() {
Expect(pluginConfig).To(HaveKey(binderplugins.VolumeBindingPluginName))
Expect(pluginConfig).To(HaveKey(binderplugins.DynamicResourcesPluginName))
Expect(pluginConfig).To(HaveKey(binderplugins.GPUSharingPluginName))
Expect(pluginConfig).To(HaveKey(binderplugins.HamiCorePluginName))
Expect(pluginConfig[binderplugins.VolumeBindingPluginName].Arguments[binderplugins.BindTimeoutSecondsArgument]).
To(Equal(strconv.Itoa(binderplugins.DefaultBindTimeoutSeconds)))
Expect(pluginConfig[binderplugins.DynamicResourcesPluginName].Arguments[binderplugins.BindTimeoutSecondsArgument]).
To(Equal(strconv.Itoa(binderplugins.DefaultBindTimeoutSeconds)))
Expect(pluginConfig[binderplugins.GPUSharingPluginName].Arguments[binderplugins.CDIEnabledArgument]).
To(Equal(strconv.FormatBool(binderplugins.DefaultCDIEnabled)))
Expect(pluginConfig[binderplugins.HamiCorePluginName].Enabled).NotTo(BeNil())
Expect(*pluginConfig[binderplugins.HamiCorePluginName].Enabled).To(BeFalse())
})

It("passes volume binding timeout through plugin arguments", func(ctx context.Context) {
Expand Down
Loading