Skip to content

Commit a52fb40

Browse files
committed
s
Signed-off-by: Omer Yahud <[email protected]>
1 parent 77e56b4 commit a52fb40

File tree

17 files changed

+494
-19
lines changed

17 files changed

+494
-19
lines changed

cmd/admission/app/app.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
// to ensure that exec-entrypoint and run can make use of them.
1414
_ "k8s.io/client-go/plugin/pkg/client/auth"
1515

16+
ocpconf "github.com/openshift/api/config/v1"
1617
"github.com/spf13/pflag"
1718
"go.uber.org/zap/zapcore"
1819
corev1 "k8s.io/api/core/v1"
@@ -41,8 +42,8 @@ var (
4142

4243
func init() {
4344
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
44-
4545
utilruntime.Must(schedulingv1alpha2.AddToScheme(scheme))
46+
utilruntime.Must(ocpconf.AddToScheme(scheme))
4647
// +kubebuilder:scaffold:scheme
4748
}
4849

@@ -174,5 +175,6 @@ func (app *App) Run() error {
174175
setupLog.Error(err, "problem running manager")
175176
return err
176177
}
178+
177179
return nil
178180
}

cmd/admission/app/options.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
package app
55

66
import (
7+
"fmt"
8+
79
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
810
"github.com/spf13/pflag"
911

@@ -22,6 +24,7 @@ type Options struct {
2224
WebhookPort int
2325
FakeGPUNodes bool
2426
GPUSharingEnabled bool
27+
GPUPodRuntimeClassName string
2528
}
2629

2730
func InitOptions() *Options {
@@ -63,6 +66,10 @@ func InitOptions() *Options {
6366
fs.BoolVar(&options.GPUSharingEnabled,
6467
"gpu-sharing-enabled", false,
6568
"Specifies if the GPU sharing is enabled")
69+
fs.StringVar(&options.GPUPodRuntimeClassName,
70+
"gpu-pod-runtime-class-name", constants.DefaultRuntimeClassName,
71+
fmt.Sprintf("Runtime class to be set for GPU pods (defaults to %s), ignored in Openshift deployments. "+
72+
"Set to empty string to disable", constants.DefaultRuntimeClassName))
6673

6774
utilfeature.DefaultMutableFeatureGate.AddFlag(fs)
6875

cmd/admission/main.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,13 @@ func registerPlugins(app *app.App) error {
4343
admissionPlugins := plugins.New()
4444

4545
admissionGpuSharingPlugin := gpusharing.New(app.Client, app.Options.GPUSharingEnabled)
46-
admissionRuntimeEnforcementPlugin := runtimeenforcement.New(app.Client)
47-
4846
admissionPlugins.RegisterPlugin(admissionGpuSharingPlugin)
49-
admissionPlugins.RegisterPlugin(admissionRuntimeEnforcementPlugin)
47+
48+
if app.Options.GPUPodRuntimeClassName != "" {
49+
admissionRuntimeEnforcementPlugin := runtimeenforcement.New(app.Client, app.Options.GPUPodRuntimeClassName)
50+
admissionPlugins.RegisterPlugin(admissionRuntimeEnforcementPlugin)
51+
}
52+
5053
app.RegisterPlugins(admissionPlugins)
5154
return nil
5255
}

deployments/kai-scheduler/templates/kai-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ spec:
105105
targetPort: {{ .Values.admission.ports.webhookPort | default 9443 }}
106106
probePort: {{ .Values.admission.ports.probePort | default 8081 }}
107107
metricsPort: {{ .Values.admission.ports.metricsPort | default 8080 }}
108+
gpuPodRuntimeClassName: {{ .Values.admission.gpuPodRuntimeClassName | default "nvidia" }}
108109

109110
nodeScaleAdjuster:
110111
service:

deployments/kai-scheduler/templates/rbac/admission.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,19 @@ rules:
3838
- create
3939
- patch
4040
- update
41+
- apiGroups:
42+
- node.k8s.io
43+
resources:
44+
- runtimeclasses
45+
verbs:
46+
- get
47+
- list
48+
- watch
49+
- apiGroups:
50+
- config.openshift.io
51+
resources:
52+
- clusterversions
53+
verbs:
54+
- get
55+
- list
56+
- watch

deployments/kai-scheduler/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ admission:
7676
metricsPort: 8080
7777
probePort: 8081
7878
cdi: false
79+
gpuPodRuntimeClassName: nvidia
7980

8081
nodescaleadjuster:
8182
image:

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ require (
1818
github.com/kubeflow/training-operator v1.9.3
1919
github.com/onsi/ginkgo/v2 v2.25.3
2020
github.com/onsi/gomega v1.38.2
21+
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7
2122
github.com/pkg/errors v0.9.1
2223
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.81.0
2324
github.com/prometheus/client_golang v1.23.2

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
246246
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
247247
github.com/opencontainers/selinux v1.11.1 h1:nHFvthhM0qY8/m+vfhJylliSshm8G1jJ2jDMcgULaH8=
248248
github.com/opencontainers/selinux v1.11.1/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec=
249+
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7 h1:dZ9uBd0Cw3+l1RGpYRkWdrRjM9yvfxrjW/uPHKUwtIQ=
250+
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7/go.mod h1:yk60tHAmHhtVpJQo3TwVYq2zpuP70iJIFDCmeKMIzPw=
249251
github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
250252
github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
251253
github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=

pkg/admission/webhook/v1alpha2/runtimeenforcement/__debug_bin2990338812

Whitespace-only changes.

pkg/admission/webhook/v1alpha2/runtimeenforcement/runtime_enforcement.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,26 @@ package runtimeenforcement
55

66
import (
77
"context"
8+
"fmt"
89

910
v1 "k8s.io/api/core/v1"
11+
"k8s.io/utils/ptr"
1012
"sigs.k8s.io/controller-runtime/pkg/client"
1113

1214
"github.com/NVIDIA/KAI-scheduler/pkg/binder/binding/resourcereservation"
13-
"github.com/NVIDIA/KAI-scheduler/pkg/binder/common"
14-
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
1515
"github.com/NVIDIA/KAI-scheduler/pkg/common/k8s_utils"
1616
"github.com/NVIDIA/KAI-scheduler/pkg/common/resources"
1717
)
1818

1919
type RuntimeEnforcement struct {
20-
kubeClient client.Client
20+
kubeClient client.Client
21+
gpuPodRuntimeClassName string
2122
}
2223

23-
func New(kubeClient client.Client) *RuntimeEnforcement {
24+
func New(kubeClient client.Client, gpuPodRuntimeClassName string) *RuntimeEnforcement {
2425
return &RuntimeEnforcement{
25-
kubeClient: kubeClient,
26+
kubeClient: kubeClient,
27+
gpuPodRuntimeClassName: gpuPodRuntimeClassName,
2628
}
2729
}
2830

@@ -50,15 +52,23 @@ func (p *RuntimeEnforcement) Mutate(pod *v1.Pod) error {
5052

5153
if resources.RequestsGPU(pod) {
5254
exists, err := k8s_utils.RuntimeClassExists(context.Background(),
53-
p.kubeClient, constants.DefaultRuntimeClassName)
55+
p.kubeClient, p.gpuPodRuntimeClassName)
5456
if err != nil {
5557
return err
5658
} else if !exists {
57-
return nil
59+
return runtimeClassDoesNotExistError(p.gpuPodRuntimeClassName)
5860
}
5961

60-
common.SetNVIDIARuntimeClass(pod)
62+
setRuntimeClass(pod, p.gpuPodRuntimeClassName)
6163
}
6264

6365
return nil
6466
}
67+
68+
func setRuntimeClass(pod *v1.Pod, runtimeClassName string) {
69+
pod.Spec.RuntimeClassName = ptr.To(runtimeClassName)
70+
}
71+
72+
func runtimeClassDoesNotExistError(runtimeClassName string) error {
73+
return fmt.Errorf("cannot set runtimeClassName: runtimeClass '%s' does not exist", runtimeClassName)
74+
}

0 commit comments

Comments
 (0)