1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -24,6 +24,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Rebuilt the `crd-upgrader` hook image on `alpine:3.20` instead of `ubi9/ubi-minimal`. Image size drops from ~165 MB to ~67 MB uncompressed (~60% reduction), shrinking cold-pull latency on ephemeral CI runners. The image is also reused by the `topology-migration` and `post-delete` hook jobs as a generic `kubectl + bash` toolbox, so bash is preserved on the runtime image. [#1404](https://github.com/kai-scheduler/KAI-Scheduler/issues/1404)

### Fixed
- Account for native sidecar containers (initContainers with `restartPolicy: Always`, KEP-753) in pod resource accounting, matching kubelet's `AggregateContainerRequests`. Previously, native sidecar requests were max'd against regular containers instead of summed with them, causing the scheduler to bind pods that kubelet then rejected at admission with `OutOfCpu`/`OutOfGpu`. [#1556](https://github.com/kai-scheduler/KAI-Scheduler/pull/1556)
- Fixed `additionalImagePullSecrets` in the Config CR rendering as `map[name:...]` instead of plain strings by extracting `.name` from `global.imagePullSecrets` objects. Also propagated `global.imagePullSecrets` to all Helm hook jobs (`crd-upgrader`, `topology-migration`, `post-delete-cleanup`).
- Added `global.nodeSelector`, `global.tolerations`, `global.affinity`, `global.securityContext` support to the post-delete job hook.
- Fixed Helm template writing `imagesPullSecret` (string) instead of `additionalImagePullSecrets` (array) in Config CR, causing image pull secrets to be silently ignored. Added backward-compatible deprecated `imagesPullSecret` field to CRD schema. [#942](https://github.com/kai-scheduler/KAI-Scheduler/issues/942)
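For illustration of the sidecar accounting fix (using the same numbers as the new test below): with a main container requesting 4000m CPU and a native sidecar requesting 250m, the old logic took max(4000m, 250m) = 4000m, while kubelet admits the pod only if 4000m + 250m = 4250m fits on the node. On a node with exactly 4000m free, the scheduler would bind the pod and kubelet would reject it with `OutOfCpu`.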
43 changes: 34 additions & 9 deletions pkg/scheduler/api/pod_info/pod_info.go
@@ -380,25 +380,50 @@ func getPodGroupID(pod *v1.Pod) common_info.PodGroupID {
func getPodResourceRequest(pod *v1.Pod) *resource_info.ResourceRequirements {
result := getPodResourceWithoutInitContainers(pod)

// take max_resource(sum_pod, any_init_container)
for _, container := range pod.Spec.InitContainers {
err := result.SetMaxResource(resource_info.RequirementsFromResourceList(container.Resources.Requests))
if err != nil {
log.InfraLogger.Errorf("Failed to calculate pod required resources for pod %s/%s. Error: %s",
pod.Namespace, pod.Name, err.Error())
}
}
sidecarSum, initPhasePeak := initContainerEffects(pod)
logIfErr(pod, result.Add(sidecarSum))
logIfErr(pod, result.SetMaxResource(initPhasePeak))

if pod.Spec.Overhead != nil {
overheadReq := resource_info.RequirementsFromResourceList(pod.Spec.Overhead)
result.Add(&overheadReq.BaseResource)
result.BaseResource.Add(&overheadReq.BaseResource)
}

result.ScalarResources()[resource_info.PodsResourceName] = 1

return result
}

// initContainerEffects returns the contributions of `pod`'s init containers to
// pod resource accounting, mirroring kubelet's `AggregateContainerRequests`:
// - sidecarSum: total request of native sidecars (initContainers with
// `restartPolicy: Always`, KEP-753), which run concurrently with regular
// containers and add to the steady-state sum.
// - initPhasePeak: max over each non-restartable init of `init.Requests +
// sum(native sidecars declared before it)`, since those sidecars are
// already running when the init runs.
func initContainerEffects(pod *v1.Pod) (sidecarSum, initPhasePeak *resource_info.ResourceRequirements) {
sidecarSum = resource_info.EmptyResourceRequirements()
initPhasePeak = resource_info.EmptyResourceRequirements()
for _, container := range pod.Spec.InitContainers {
containerReq := resource_info.RequirementsFromResourceList(container.Resources.Requests)
if container.RestartPolicy != nil && *container.RestartPolicy == v1.ContainerRestartPolicyAlways {
logIfErr(pod, sidecarSum.Add(containerReq))
continue
}
logIfErr(pod, containerReq.Add(sidecarSum))
logIfErr(pod, initPhasePeak.SetMaxResource(containerReq))
}
return sidecarSum, initPhasePeak
}

func logIfErr(pod *v1.Pod, err error) {
if err != nil {
log.InfraLogger.Errorf("Failed to calculate pod required resources for pod %s/%s. Error: %s",
pod.Namespace, pod.Name, err.Error())
}
}

// getPodResourceWithoutInitContainers returns Pod's resource request, it does not contain
// init containers' resource request.
func getPodResourceWithoutInitContainers(pod *v1.Pod) *resource_info.ResourceRequirements {
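For reviewers less familiar with KEP-753, here is a self-contained sketch of the aggregation rule that `initContainerEffects` mirrors, written against plain `v1.ResourceList` values rather than KAI's `ResourceRequirements` type. The helper names (`addInto`, `maxInto`, `aggregateRequests`) are illustrative only, not part of this PR or of kubelet's API.

```go
// Package sketch illustrates the KEP-753 aggregation rule; not part of KAI-Scheduler.
package sketch

import (
	v1 "k8s.io/api/core/v1"
)

// addInto sums src into dst, resource by resource.
func addInto(dst, src v1.ResourceList) {
	for name, q := range src {
		if cur, ok := dst[name]; ok {
			cur.Add(q)
			dst[name] = cur
		} else {
			dst[name] = q.DeepCopy()
		}
	}
}

// maxInto raises dst to at least src, resource by resource.
func maxInto(dst, src v1.ResourceList) {
	for name, q := range src {
		if cur, ok := dst[name]; !ok || q.Cmp(cur) > 0 {
			dst[name] = q.DeepCopy()
		}
	}
}

// aggregateRequests computes the pod-level request: the steady-state sum of
// regular containers and native sidecars, max'd with each non-restartable init
// container's request plus the sidecars declared before it.
func aggregateRequests(pod *v1.Pod) v1.ResourceList {
	total := v1.ResourceList{}
	for _, c := range pod.Spec.Containers {
		addInto(total, c.Resources.Requests)
	}

	sidecars := v1.ResourceList{}
	initPeak := v1.ResourceList{}
	for _, c := range pod.Spec.InitContainers {
		if c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways {
			// Native sidecar: runs for the pod's whole lifetime.
			addInto(sidecars, c.Resources.Requests)
			continue
		}
		// Regular init: sidecars declared earlier are already running.
		phase := v1.ResourceList{}
		addInto(phase, c.Resources.Requests)
		addInto(phase, sidecars)
		maxInto(initPeak, phase)
	}

	addInto(total, sidecars) // sidecars add to the steady-state sum
	maxInto(total, initPeak) // the init-phase peak may still dominate
	return total
}
```

Applied to the first new test case below (main 4000m/8Gi, sidecar 250m/256Mi, regular init 500m/1G), this yields 4250m CPU and 8Gi+256Mi memory, matching the expected values.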
100 changes: 100 additions & 0 deletions pkg/scheduler/api/pod_info/pod_info_test.go
@@ -129,6 +129,106 @@ func TestGetPodResourceRequest(t *testing.T) {
},
expectedResource: resource_info.NewResourceRequirements(1, 3000, 5000000000),
},
{
name: "pod with native sidecar (initContainer with restartPolicy=Always)",
pod: &v1.Pod{
Spec: v1.PodSpec{
InitContainers: []v1.Container{
{
// Native sidecar — added to running sum.
RestartPolicy: ptr.To(v1.ContainerRestartPolicyAlways),
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceList("250m", "256Mi"),
},
},
{
// Regular init container — max'd against running sum.
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceList("500m", "1G"),
},
},
},
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceList("4000m", "8Gi"),
},
},
},
},
},
// containers (4000m, 8Gi) + sidecar (250m, 256Mi) = 4250m, 8Gi+256Mi.
// Regular init (500m, 1G) plus the preceding sidecar is still below
// that, so the max yields the running sum.
expectedResource: resource_info.RequirementsFromResourceList(
common_info.BuildResourceList("4250m", "8858370048"),
),
},
{
// Mirrors upstream `AggregateContainerRequests` (KEP-753): a
// regular initContainer's peak demand includes any native sidecars
// declared before it, since those sidecars start first and run
// concurrently with the init.
name: "regular init dominates and includes preceding native sidecar",
pod: &v1.Pod{
Spec: v1.PodSpec{
InitContainers: []v1.Container{
{
RestartPolicy: ptr.To(v1.ContainerRestartPolicyAlways),
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceList("500m", "256Mi"),
},
},
{
// Regular init dominates the steady-state sum.
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceList("5000m", "1Gi"),
},
},
},
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceList("1000m", "1Gi"),
},
},
},
},
},
// steady-state = main(1000m,1Gi) + sidecar(500m,256Mi) = 1500m, 1Gi+256Mi.
// init-phase peak = init(5000m,1Gi) + sidecar(500m,256Mi) = 5500m, 1Gi+256Mi.
// max → 5500m, 1Gi+256Mi (= 1342177280 bytes).
expectedResource: resource_info.RequirementsFromResourceList(
common_info.BuildResourceList("5500m", "1342177280"),
),
},
{
// Native sidecars that request GPUs must contribute to the GPU
// half of the running sum, not be silently dropped via method
// promotion to BaseResource.Add.
name: "native sidecar with GPU is summed into pod GPU request",
pod: &v1.Pod{
Spec: v1.PodSpec{
InitContainers: []v1.Container{
{
RestartPolicy: ptr.To(v1.ContainerRestartPolicyAlways),
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceListWithGPU("250m", "256Mi", "1"),
},
},
},
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: common_info.BuildResourceListWithGPU("4000m", "8Gi", "2"),
},
},
},
},
},
// main(4000m, 8Gi, 2 GPUs) + sidecar(250m, 256Mi, 1 GPU) =
// 4250m, 8Gi+256Mi, 3 GPUs.
expectedResource: resource_info.NewResourceRequirements(3, 4250, 8858370048),
},
{
name: "pod with overhead resources",
pod: &v1.Pod{
27 changes: 27 additions & 0 deletions pkg/scheduler/api/resource_info/gpu_resource_requirment.go
@@ -113,6 +113,33 @@ func (g *GpuResourceRequirement) Clone() *GpuResourceRequirement {
}
}

// Add sums `gg` into `g`. KAI represents whole and fractional GPU requests as
// `count × portion`; summing two requirements with different non-zero
// fractional portions has no canonical representation, so we surface that as
// an error (mirroring `SetMaxResource`). Whole-GPU additions (portion = 1) and
// summing into an empty receiver are well-defined.
func (g *GpuResourceRequirement) Add(gg *GpuResourceRequirement) error {
if gg == nil {
return nil
}
if g.portion == 0 && g.count == 0 {
g.count = gg.count
g.portion = gg.portion
} else if gg.portion != 0 || gg.count != 0 {
if g.portion != gg.portion {
return fmt.Errorf("cannot add GpuResourceRequirements with different fractional portions: %v vs %v", g.portion, gg.portion)
}
g.count += gg.count
}
for name, ggQuant := range gg.draGpuCounts {
g.draGpuCounts[name] += ggQuant
}
for name, ggQuant := range gg.migResources {
g.migResources[name] += ggQuant
}
return nil
}

func (g *GpuResourceRequirement) SetMaxResource(gg *GpuResourceRequirement) error {
if g.portion != 0 && gg.portion != 0 && g.portion != gg.portion {
return fmt.Errorf("cannot calculate max resource for GpuResourceRequirements with different fractional portions. %v vs %v", g.portion, gg.portion)
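For illustration of the representation constraint described in the `Add` comment: two half-GPU slices are (count = 2, portion = 0.5), i.e. 2 × 0.5 = 1.0 GPU-equivalents. Adding (count = 1, portion = 0.25) would need a single uniform portion to describe slices of two different sizes, which one count × portion pair cannot express, hence the error. Whole-GPU sums such as (2, 1) + (1, 1) = (3, 1) remain well-defined.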
10 changes: 10 additions & 0 deletions pkg/scheduler/api/resource_info/resource_requirment.go
@@ -109,6 +109,16 @@ func (r *ResourceRequirements) SetMaxResource(rr *ResourceRequirements) error {
return r.GpuResourceRequirement.SetMaxResource(&rr.GpuResourceRequirement)
}

// Add sums `rr` into `r` across both `BaseResource` and
// `GpuResourceRequirement`.
func (r *ResourceRequirements) Add(rr *ResourceRequirements) error {
if r == nil || rr == nil {
return nil
}
r.BaseResource.Add(&rr.BaseResource)
return r.GpuResourceRequirement.Add(&rr.GpuResourceRequirement)
}

func (r *ResourceRequirements) LessInAtLeastOneResource(rr *ResourceRequirements) bool {
return !rr.LessEqual(r)
}