diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f65757ae..d378cf537 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Allow the configuration of plugins in the binder service. [#1480](https://github.com/kai-scheduler/KAI-Scheduler/pull/1480) - [davidLif](https://github.com/davidLif) - Added support for configuring scheduler log level and custom scheduler args via Helm values (`scheduler.args`) [#1452](https://github.com/kai-scheduler/KAI-Scheduler/pull/1452) [dttung2905](https://github.com/dttung2905) - Added `crdupgrader.image.registry` Helm value to override `global.registry` for the `crd-upgrader` pre-install/pre-upgrade hook image, allowing the hook image to be served from a separate mirror without redirecting all chart images. [#1404](https://github.com/kai-scheduler/KAI-Scheduler/issues/1404) +- Added an opt-in `hamicore` binder plugin (depends on `gpusharing`) to write the HAMI-core GPU memory limit (`CUDA_DEVICE_MEMORY_LIMIT`) for fractional GPU pods. ### Changed - **Breaking:** JobSet PodGroups no longer auto-calculate `minAvailable` from `parallelism × replicas`. The default is now 1. Use the `kai.scheduler/batch-min-member` annotation to set a custom value. diff --git a/docs/developer/binder.md b/docs/developer/binder.md index e9a9c8e96..6bbf381a0 100644 --- a/docs/developer/binder.md +++ b/docs/developer/binder.md @@ -121,8 +121,9 @@ The current default binder plugins are: | `volumebinding` | 300 | `bindTimeoutSeconds: "120"` | Handles Kubernetes persistent volume binding before pod bind. | | `dynamicresources` | 200 | `bindTimeoutSeconds: "120"` | Handles Kubernetes Dynamic Resource Allocation claim binding before pod bind. | | `gpusharing` | 100 | `cdiEnabled: "false"` | Handles fractional GPU pod mutation needed for GPU sharing. | +| `hamicore` | 50 | | Optional HAMI-core GPU virtualization for fractional GPU pods. Depends on `gpusharing`. Disabled by default. | -For operator-managed deployments, the operator sets the `gpusharing` `cdiEnabled` argument from `spec.binder.cdiEnabled`. If `spec.binder.cdiEnabled` is unset, the operator attempts to auto-detect CDI from the NVIDIA GPU Operator `ClusterPolicy`. +For operator-managed deployments, the operator sets the `gpusharing` `cdiEnabled` argument from `spec.binder.cdiEnabled`. If `spec.binder.cdiEnabled` is unset, the operator attempts to auto-detect CDI from the NVIDIA GPU Operator `ClusterPolicy`. Enable `hamicore` to opt into HAMI-core GPU memory limits. ### Config Examples @@ -197,6 +198,10 @@ The dynamic resources plugin handles Kubernetes Dynamic Resource Allocation reso The GPU sharing plugin handles fractional GPU assignments. For shared GPU allocations it creates the required GPU sharing ConfigMaps and sets the NVIDIA visible devices and GPU portion information on the target container. +#### HAMI-core Plugin + +The HAMI-core plugin is disabled by default and **requires `gpusharing` to be enabled**. When enabled, it writes `CUDA_DEVICE_MEMORY_LIMIT` to the GPU sharing ConfigMap based on node label `nvidia.com/gpu.memory` and `BindRequest.spec.receivedGPU.portion`. 
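+
+A minimal sketch of opting in through the binder `plugins` configuration (the `enabled` field name here is assumed to mirror the `PluginConfig` Go fields in `pkg/apis/kai/v1/binder`; see the Config Examples section above for the full configuration shape):
+
+```yaml
+spec:
+  binder:
+    plugins:
+      hamicore:
+        enabled: true
+```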
+ ### Creating Custom Plugins To create a custom binder plugin: diff --git a/pkg/apis/kai/v1/binder/binder.go b/pkg/apis/kai/v1/binder/binder.go index 77969188a..9134d4471 100644 --- a/pkg/apis/kai/v1/binder/binder.go +++ b/pkg/apis/kai/v1/binder/binder.go @@ -22,6 +22,7 @@ const ( VolumeBindingPluginName = "volumebinding" DynamicResourcesPluginName = "dynamicresources" GPUSharingPluginName = "gpusharing" + HamiCorePluginName = "hamicore" BindTimeoutSecondsArgument = "bindTimeoutSeconds" CDIEnabledArgument = "cdiEnabled" @@ -34,6 +35,7 @@ var defaultPluginPriorities = map[string]int{ VolumeBindingPluginName: 300, DynamicResourcesPluginName: 200, GPUSharingPluginName: 100, + HamiCorePluginName: 50, } // PluginConfig allows overriding binder plugin settings. @@ -84,7 +86,7 @@ type Binder struct { // Plugins allows overriding binder plugin configuration. Keys are plugin names. // Built-in plugins can be disabled, reordered, or have their arguments changed. - // Built-in plugins: volumebinding, dynamicresources, gpusharing. + // Built-in plugins: volumebinding, dynamicresources, gpusharing, hamicore. // +kubebuilder:validation:Optional Plugins map[string]PluginConfig `json:"plugins,omitempty"` @@ -196,6 +198,10 @@ func DefaultPluginsConfig(bindTimeoutSeconds int, cdiEnabled bool) map[string]Pl CDIEnabledArgument: strconv.FormatBool(cdiEnabled), }, }, + HamiCorePluginName: { + Enabled: ptr.To(false), + Priority: ptr.To(defaultPluginPriorities[HamiCorePluginName]), + }, } } diff --git a/pkg/apis/kai/v1/binder/binder_test.go b/pkg/apis/kai/v1/binder/binder_test.go index fab492a94..3d65d1bdc 100644 --- a/pkg/apis/kai/v1/binder/binder_test.go +++ b/pkg/apis/kai/v1/binder/binder_test.go @@ -51,6 +51,8 @@ var _ = Describe("Binder", func() { binder.SetDefaultsWhereNeeded(nil, nil) Expect(binder.Plugins[GPUSharingPluginName].Arguments[CDIEnabledArgument]). 
To(Equal(strconv.FormatBool(true))) + Expect(binder.Plugins[HamiCorePluginName].Enabled).NotTo(BeNil()) + Expect(*binder.Plugins[HamiCorePluginName].Enabled).To(BeFalse()) }) It("Set Defaults With Plugin Overrides", func(ctx context.Context) { diff --git a/pkg/binder/common/constants.go b/pkg/binder/common/constants.go index 30e7e4c0d..72d9ec324 100644 --- a/pkg/binder/common/constants.go +++ b/pkg/binder/common/constants.go @@ -4,7 +4,8 @@ package common const ( - GPUPortion = "GPU_PORTION" - ReceivedTypeFraction = "Fraction" - ReceivedTypeRegular = "Regular" + GPUPortion = "GPU_PORTION" + CudaDeviceMemoryLimit = "CUDA_DEVICE_MEMORY_LIMIT" + ReceivedTypeFraction = "Fraction" + ReceivedTypeRegular = "Regular" ) diff --git a/pkg/binder/common/gpu_access.go b/pkg/binder/common/gpu_access.go index d0a578e10..a787648aa 100644 --- a/pkg/binder/common/gpu_access.go +++ b/pkg/binder/common/gpu_access.go @@ -9,6 +9,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -58,6 +59,19 @@ func AddGPUSharingEnvVars(container *v1.Container, sharedGpuConfigMapName string }, }, }) + + AddEnvVarToContainer(container, v1.EnvVar{ + Name: CudaDeviceMemoryLimit, + ValueFrom: &v1.EnvVarSource{ + ConfigMapKeyRef: &v1.ConfigMapKeySelector{ + Key: CudaDeviceMemoryLimit, + LocalObjectReference: v1.LocalObjectReference{ + Name: sharedGpuConfigMapName, + }, + Optional: ptr.To(true), + }, + }, + }) } func SetNvidiaVisibleDevices( @@ -122,6 +136,27 @@ func SetGPUPortion( return nil } +func SetCudaDeviceMemoryLimit( + ctx context.Context, kubeClient client.Client, pod *v1.Pod, containerRef *gpusharingconfigmap.PodContainerRef, + cudaDeviceMemoryLimit string, +) error { + updateFunc := func(data map[string]string) error { + data[CudaDeviceMemoryLimit] = cudaDeviceMemoryLimit + return nil + } + capabilitiesMapName, err := gpusharingconfigmap.ExtractCapabilitiesConfigMapName(pod, containerRef) + if err != nil { + return err + } + + err = UpdateConfigMapEnvironmentVariable(ctx, kubeClient, pod, capabilitiesMapName, updateFunc) + if err != nil { + return fmt.Errorf("failed to update CUDA_DEVICE_MEMORY_LIMIT value in gpu sharing configmap for pod <%s/%s>: %v", + pod.Namespace, pod.Name, err) + } + return nil +} + func UpdateConfigMapEnvironmentVariable( ctx context.Context, kubeclient client.Client, task *v1.Pod, configMapName string, changesFunc func(map[string]string) error, diff --git a/pkg/binder/plugins/config.go b/pkg/binder/plugins/config.go index db917a9d9..d969dc4e5 100644 --- a/pkg/binder/plugins/config.go +++ b/pkg/binder/plugins/config.go @@ -16,6 +16,7 @@ const ( VolumeBindingPluginName = kaiv1binder.VolumeBindingPluginName DynamicResourcesPluginName = kaiv1binder.DynamicResourcesPluginName GPUSharingPluginName = kaiv1binder.GPUSharingPluginName + HamiCorePluginName = kaiv1binder.HamiCorePluginName BindTimeoutSecondsArgument = kaiv1binder.BindTimeoutSecondsArgument CDIEnabledArgument = kaiv1binder.CDIEnabledArgument diff --git a/pkg/binder/plugins/config_test.go b/pkg/binder/plugins/config_test.go index 34fe19a15..e7b33600e 100644 --- a/pkg/binder/plugins/config_test.go +++ b/pkg/binder/plugins/config_test.go @@ -27,6 +27,12 @@ func TestDefaultConfig(t *testing.T) { if got := config[GPUSharingPluginName].Arguments[CDIEnabledArgument]; got != "true" { t.Fatalf("expected gpusharing CDI true, got %q", got) } + if *config[HamiCorePluginName].Enabled { + t.Fatalf("expected hamicore to be disabled 
by default") + } + if got := len(config[HamiCorePluginName].Arguments); got != 0 { + t.Fatalf("expected hamicore to have no default arguments, got %d", got) + } options := config.EnabledOptions() if got := []string{options[0].Name, options[1].Name, options[2].Name}; !equalStrings(got, diff --git a/pkg/binder/plugins/dependencies_test.go b/pkg/binder/plugins/dependencies_test.go new file mode 100644 index 000000000..4c93d3131 --- /dev/null +++ b/pkg/binder/plugins/dependencies_test.go @@ -0,0 +1,42 @@ +// Copyright 2026 NVIDIA CORPORATION +// SPDX-License-Identifier: Apache-2.0 + +package plugins + +import ( + "testing" + + "k8s.io/utils/ptr" +) + +func TestValidateDependentPluginsHamiCoreRequiresGPUSharing(t *testing.T) { + cfg := Config{ + GPUSharingPluginName: { + Enabled: ptr.To(false), + }, + HamiCorePluginName: { + Enabled: ptr.To(true), + }, + } + + if err := validateDependentPlugins(cfg); err == nil { + t.Fatalf("expected dependency validation error") + } +} + +func TestValidateDependentPluginsHamiCoreRequiresOrdering(t *testing.T) { + cfg := Config{ + GPUSharingPluginName: { + Enabled: ptr.To(true), + Priority: ptr.To(50), + }, + HamiCorePluginName: { + Enabled: ptr.To(true), + Priority: ptr.To(100), + }, + } + + if err := validateDependentPlugins(cfg); err == nil { + t.Fatalf("expected ordering validation error") + } +} diff --git a/pkg/binder/plugins/factory.go b/pkg/binder/plugins/factory.go index cfe4ca02d..4f3192085 100644 --- a/pkg/binder/plugins/factory.go +++ b/pkg/binder/plugins/factory.go @@ -13,6 +13,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/gpusharing" + "github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/hamicore" k8splugins "github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/k8s-plugins" ) @@ -48,9 +49,14 @@ func InitDefaultPlugins() { RegisterPluginBuilder(VolumeBindingPluginName, newVolumeBindingPlugin) RegisterPluginBuilder(DynamicResourcesPluginName, newDynamicResourcesPlugin) RegisterPluginBuilder(GPUSharingPluginName, newGPUSharingPlugin) + RegisterPluginBuilder(HamiCorePluginName, newHamiCorePlugin) } func BuildConfiguredPlugins(buildContext PluginBuildContext, config Config) (*BinderPlugins, error) { + if err := validateDependentPlugins(config); err != nil { + return nil, err + } + binderPlugins := New() for _, option := range config.EnabledOptions() { builder, found := GetPluginBuilder(option.Name) @@ -101,6 +107,41 @@ func newGPUSharingPlugin(buildContext PluginBuildContext, arguments map[string]s return gpusharing.New(buildContext.KubeClient, cdiEnabled), nil } +func newHamiCorePlugin(buildContext PluginBuildContext, _ map[string]string) (Plugin, error) { + return hamicore.New(buildContext.KubeClient), nil +} + +func validateDependentPlugins(config Config) error { + hamiCoreCfg, hamiCoreFound := config[HamiCorePluginName] + if !hamiCoreFound || (hamiCoreCfg.Enabled != nil && !*hamiCoreCfg.Enabled) { + return nil + } + + gpuSharingCfg, gpuSharingFound := config[GPUSharingPluginName] + if !gpuSharingFound || (gpuSharingCfg.Enabled != nil && !*gpuSharingCfg.Enabled) { + return fmt.Errorf("%q plugin requires %q plugin to be enabled", HamiCorePluginName, GPUSharingPluginName) + } + + // PreBind is invoked in EnabledOptions() order (higher priority first). + // hamicore requires gpusharing to have already created the configmap. 
+ hamiCorePri := ptrDerefInt(config[HamiCorePluginName].Priority, 0) + gpuSharingPri := ptrDerefInt(config[GPUSharingPluginName].Priority, 0) + if gpuSharingPri <= hamiCorePri { + return fmt.Errorf("%q plugin requires %q to run before it (expected %q.priority > %q.priority, got %d <= %d)", + HamiCorePluginName, GPUSharingPluginName, GPUSharingPluginName, HamiCorePluginName, + gpuSharingPri, hamiCorePri) + } + + return nil +} + +func ptrDerefInt(v *int, defaultValue int) int { + if v == nil { + return defaultValue + } + return *v +} + func int64Argument(arguments map[string]string, name string) (int64, error) { value, found := arguments[name] if !found { @@ -124,3 +165,15 @@ func boolArgument(arguments map[string]string, name string) (bool, error) { } return parsed, nil } + +func boolArgumentOrDefault(arguments map[string]string, name string, defaultValue bool) (bool, error) { + value, found := arguments[name] + if !found { + return defaultValue, nil + } + parsed, err := strconv.ParseBool(value) + if err != nil { + return false, fmt.Errorf("invalid argument %q=%q: %w", name, value, err) + } + return parsed, nil +} diff --git a/pkg/binder/plugins/hamicore/hami_core.go b/pkg/binder/plugins/hamicore/hami_core.go new file mode 100644 index 000000000..ee940e554 --- /dev/null +++ b/pkg/binder/plugins/hamicore/hami_core.go @@ -0,0 +1,89 @@ +// Copyright 2026 NVIDIA CORPORATION +// SPDX-License-Identifier: Apache-2.0 + +package hamicore + +import ( + "context" + "fmt" + "strconv" + + v1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/kai-scheduler/KAI-scheduler/pkg/apis/scheduling/v1alpha2" + "github.com/kai-scheduler/KAI-scheduler/pkg/binder/common" + "github.com/kai-scheduler/KAI-scheduler/pkg/binder/plugins/state" + "github.com/kai-scheduler/KAI-scheduler/pkg/common/constants" +) + +type Plugin struct { + kubeClient client.Client +} + +func New(kubeClient client.Client) *Plugin { + return &Plugin{kubeClient: kubeClient} +} + +func (p *Plugin) Name() string { + return "hamicore" +} + +func (p *Plugin) PreBind( + ctx context.Context, pod *v1.Pod, node *v1.Node, bindRequest *v1alpha2.BindRequest, _ *state.BindingState, +) error { + if !common.IsSharedGPUAllocation(bindRequest) { + return nil + } + + cudaDeviceMemoryLimit, err := calculateCudaDeviceMemoryLimit(node, bindRequest) + if err != nil { + return nil + } + + containerRef, err := common.GetFractionContainerRef(pod) + if err != nil { + return fmt.Errorf("failed to get fraction container ref: %w", err) + } + + return common.SetCudaDeviceMemoryLimit(ctx, p.kubeClient, pod, containerRef, cudaDeviceMemoryLimit) +} + +func calculateCudaDeviceMemoryLimit(node *v1.Node, bindRequest *v1alpha2.BindRequest) (string, error) { + if node == nil || bindRequest == nil || bindRequest.Spec.ReceivedGPU == nil { + return "", fmt.Errorf("missing data for CUDA_DEVICE_MEMORY_LIMIT calculation") + } + + memoryLabel, found := node.Labels[constants.NvidiaGpuMemory] + if !found { + return "", fmt.Errorf("node does not include %s label", constants.NvidiaGpuMemory) + } + + totalGPUMemoryMib, err := strconv.ParseInt(memoryLabel, 10, 64) + if err != nil || totalGPUMemoryMib <= 0 { + return "", fmt.Errorf("invalid %s label value %q", constants.NvidiaGpuMemory, memoryLabel) + } + + gpuPortion, err := strconv.ParseFloat(bindRequest.Spec.ReceivedGPU.Portion, 64) + if err != nil || gpuPortion <= 0 { + return "", fmt.Errorf("invalid received gpu portion %q", bindRequest.Spec.ReceivedGPU.Portion) + } + + allocatedMemoryMib := 
int64(float64(totalGPUMemoryMib) * gpuPortion) + if allocatedMemoryMib <= 0 { + return "", fmt.Errorf("calculated allocated gpu memory is zero") + } + + return strconv.FormatInt(allocatedMemoryMib, 10), nil +} + +func (p *Plugin) PostBind( + context.Context, *v1.Pod, *v1.Node, *v1alpha2.BindRequest, *state.BindingState, +) { +} + +func (p *Plugin) Rollback( + context.Context, *v1.Pod, *v1.Node, *v1alpha2.BindRequest, *state.BindingState, +) error { + return nil +} diff --git a/pkg/operator/operands/binder/binder_test.go b/pkg/operator/operands/binder/binder_test.go index e6317c224..94e60e89d 100644 --- a/pkg/operator/operands/binder/binder_test.go +++ b/pkg/operator/operands/binder/binder_test.go @@ -113,12 +113,15 @@ var _ = Describe("Binder", func() { Expect(pluginConfig).To(HaveKey(binderplugins.VolumeBindingPluginName)) Expect(pluginConfig).To(HaveKey(binderplugins.DynamicResourcesPluginName)) Expect(pluginConfig).To(HaveKey(binderplugins.GPUSharingPluginName)) + Expect(pluginConfig).To(HaveKey(binderplugins.HamiCorePluginName)) Expect(pluginConfig[binderplugins.VolumeBindingPluginName].Arguments[binderplugins.BindTimeoutSecondsArgument]). To(Equal(strconv.Itoa(binderplugins.DefaultBindTimeoutSeconds))) Expect(pluginConfig[binderplugins.DynamicResourcesPluginName].Arguments[binderplugins.BindTimeoutSecondsArgument]). To(Equal(strconv.Itoa(binderplugins.DefaultBindTimeoutSeconds))) Expect(pluginConfig[binderplugins.GPUSharingPluginName].Arguments[binderplugins.CDIEnabledArgument]). To(Equal(strconv.FormatBool(binderplugins.DefaultCDIEnabled))) + Expect(pluginConfig[binderplugins.HamiCorePluginName].Enabled).NotTo(BeNil()) + Expect(*pluginConfig[binderplugins.HamiCorePluginName].Enabled).To(BeFalse()) }) It("passes volume binding timeout through plugin arguments", func(ctx context.Context) {