
Commit ed6ebbe

ScaleUp for check-capacity ProvisioningRequestClass (#6451)
* ScaleUp for check-capacity ProvisioningRequestClass
* update condition logic
* Update tests
* Naming update
* Update cluster-autoscaler/core/scaleup/orchestrator/wrapper_orchestrator_test.go

Co-authored-by: Bartek Wróblewski <[email protected]>
1 parent cf171a7 · commit ed6ebbe

File tree

18 files changed: +1395 −50 lines

cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go

Lines changed: 13 additions & 19 deletions
@@ -88,7 +88,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
     nodeInfos map[string]*schedulerframework.NodeInfo,
 ) (*status.ScaleUpStatus, errors.AutoscalerError) {
     if !o.initialized {
-        return scaleUpError(&status.ScaleUpStatus{}, errors.NewAutoscalerError(errors.InternalError, "ScaleUpOrchestrator is not initialized"))
+        return status.UpdateScaleUpError(&status.ScaleUpStatus{}, errors.NewAutoscalerError(errors.InternalError, "ScaleUpOrchestrator is not initialized"))
     }
 
     loggingQuota := klogx.PodsLoggingQuota()
@@ -103,7 +103,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
 
     upcomingNodes, aErr := o.UpcomingNodes(nodeInfos)
     if aErr != nil {
-        return scaleUpError(&status.ScaleUpStatus{}, aErr.AddPrefix("could not get upcoming nodes: "))
+        return status.UpdateScaleUpError(&status.ScaleUpStatus{}, aErr.AddPrefix("could not get upcoming nodes: "))
     }
     klog.V(4).Infof("Upcoming %d nodes", len(upcomingNodes))
 
@@ -112,7 +112,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
         var err error
         nodeGroups, nodeInfos, err = o.processors.NodeGroupListProcessor.Process(o.autoscalingContext, nodeGroups, nodeInfos, unschedulablePods)
         if err != nil {
-            return scaleUpError(&status.ScaleUpStatus{}, errors.ToAutoscalerError(errors.InternalError, err))
+            return status.UpdateScaleUpError(&status.ScaleUpStatus{}, errors.ToAutoscalerError(errors.InternalError, err))
         }
     }
 
@@ -121,7 +121,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
 
     resourcesLeft, aErr := o.resourceManager.ResourcesLeft(o.autoscalingContext, nodeInfos, nodes)
     if aErr != nil {
-        return scaleUpError(&status.ScaleUpStatus{}, aErr.AddPrefix("could not compute total resources: "))
+        return status.UpdateScaleUpError(&status.ScaleUpStatus{}, aErr.AddPrefix("could not compute total resources: "))
     }
 
     now := time.Now()
@@ -186,15 +186,15 @@ func (o *ScaleUpOrchestrator) ScaleUp(
 
     newNodes, aErr := o.GetCappedNewNodeCount(bestOption.NodeCount, len(nodes)+len(upcomingNodes))
     if aErr != nil {
-        return scaleUpError(&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods}, aErr)
+        return status.UpdateScaleUpError(&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods}, aErr)
     }
 
     createNodeGroupResults := make([]nodegroups.CreateNodeGroupResult, 0)
     if !bestOption.NodeGroup.Exist() {
         oldId := bestOption.NodeGroup.Id()
         createNodeGroupResult, aErr := o.processors.NodeGroupManager.CreateNodeGroup(o.autoscalingContext, bestOption.NodeGroup)
         if aErr != nil {
-            return scaleUpError(
+            return status.UpdateScaleUpError(
                 &status.ScaleUpStatus{FailedCreationNodeGroups: []cloudprovider.NodeGroup{bestOption.NodeGroup}, PodsTriggeredScaleUp: bestOption.Pods},
                 aErr)
         }
@@ -253,7 +253,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
     if !found {
         // This should never happen, as we already should have retrieved nodeInfo for any considered nodegroup.
         klog.Errorf("No node info for: %s", bestOption.NodeGroup.Id())
-        return scaleUpError(
+        return status.UpdateScaleUpError(
             &status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods},
             errors.NewAutoscalerError(
                 errors.CloudProviderError,
@@ -263,7 +263,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
     // Apply upper limits for CPU and memory.
     newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup)
     if aErr != nil {
-        return scaleUpError(
+        return status.UpdateScaleUpError(
             &status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods},
             aErr)
     }
@@ -283,15 +283,15 @@ func (o *ScaleUpOrchestrator) ScaleUp(
 
     scaleUpInfos, aErr := o.processors.NodeGroupSetProcessor.BalanceScaleUpBetweenGroups(o.autoscalingContext, targetNodeGroups, newNodes)
     if aErr != nil {
-        return scaleUpError(
+        return status.UpdateScaleUpError(
             &status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods},
             aErr)
     }
 
     klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
     aErr, failedNodeGroups := o.scaleUpExecutor.ExecuteScaleUps(scaleUpInfos, nodeInfos, now)
     if aErr != nil {
-        return scaleUpError(
+        return status.UpdateScaleUpError(
             &status.ScaleUpStatus{
                 CreateNodeGroupResults: createNodeGroupResults,
                 FailedResizeNodeGroups: failedNodeGroups,
@@ -322,7 +322,7 @@ func (o *ScaleUpOrchestrator) ScaleUpToNodeGroupMinSize(
     nodeInfos map[string]*schedulerframework.NodeInfo,
 ) (*status.ScaleUpStatus, errors.AutoscalerError) {
     if !o.initialized {
-        return scaleUpError(&status.ScaleUpStatus{}, errors.NewAutoscalerError(errors.InternalError, "ScaleUpOrchestrator is not initialized"))
+        return status.UpdateScaleUpError(&status.ScaleUpStatus{}, errors.NewAutoscalerError(errors.InternalError, "ScaleUpOrchestrator is not initialized"))
     }
 
     now := time.Now()
@@ -331,7 +331,7 @@ func (o *ScaleUpOrchestrator) ScaleUpToNodeGroupMinSize(
 
     resourcesLeft, aErr := o.resourceManager.ResourcesLeft(o.autoscalingContext, nodeInfos, nodes)
     if aErr != nil {
-        return scaleUpError(&status.ScaleUpStatus{}, aErr.AddPrefix("could not compute total resources: "))
+        return status.UpdateScaleUpError(&status.ScaleUpStatus{}, aErr.AddPrefix("could not compute total resources: "))
     }
 
     for _, ng := range nodeGroups {
@@ -397,7 +397,7 @@ func (o *ScaleUpOrchestrator) ScaleUpToNodeGroupMinSize(
     klog.V(1).Infof("ScaleUpToNodeGroupMinSize: final scale-up plan: %v", scaleUpInfos)
     aErr, failedNodeGroups := o.scaleUpExecutor.ExecuteScaleUps(scaleUpInfos, nodeInfos, now)
     if aErr != nil {
-        return scaleUpError(
+        return status.UpdateScaleUpError(
             &status.ScaleUpStatus{
                 FailedResizeNodeGroups: failedNodeGroups,
             },
@@ -717,9 +717,3 @@ func GetPodsAwaitingEvaluation(egs []*equivalence.PodGroup, bestOption string) [
     }
     return awaitsEvaluation
 }
-
-func scaleUpError(s *status.ScaleUpStatus, err errors.AutoscalerError) (*status.ScaleUpStatus, errors.AutoscalerError) {
-    s.ScaleUpError = &err
-    s.Result = status.ScaleUpError
-    return s, err
-}
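
Every call site above switches from the file-local scaleUpError helper to status.UpdateScaleUpError, so the error-to-status plumbing can be shared with the new ProvisioningRequest orchestrator instead of living inside this file. A minimal sketch of what that shared helper presumably looks like, assuming it simply mirrors the deleted local function (the actual code in the status package is not shown in this diff):

// Sketch only: presumed shape of the shared helper in
// cluster-autoscaler/processors/status, mirroring the removed scaleUpError.
func UpdateScaleUpError(s *ScaleUpStatus, err errors.AutoscalerError) (*ScaleUpStatus, errors.AutoscalerError) {
    // Record the error on the status, mark the result as failed, and hand
    // both back so callers can `return status.UpdateScaleUpError(...)` directly.
    s.ScaleUpError = &err
    s.Result = ScaleUpError
    return s, err
}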
cluster-autoscaler/core/scaleup/orchestrator/wrapper_orchestrator.go

Lines changed: 112 additions & 0 deletions (new file)

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package orchestrator

import (
    "fmt"

    appsv1 "k8s.io/api/apps/v1"
    apiv1 "k8s.io/api/core/v1"
    "k8s.io/autoscaler/cluster-autoscaler/clusterstate"
    "k8s.io/autoscaler/cluster-autoscaler/context"
    "k8s.io/autoscaler/cluster-autoscaler/core/scaleup"
    ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
    "k8s.io/autoscaler/cluster-autoscaler/processors/provreq"
    "k8s.io/autoscaler/cluster-autoscaler/processors/status"
    "k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/checkcapacity"
    "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
    "k8s.io/autoscaler/cluster-autoscaler/utils/taints"
    "k8s.io/client-go/rest"
    schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// WrapperOrchestrator is an orchestrator that wraps scale-up for ProvisioningRequests and regular pods.
// Each loop it splits out regular pods and pods from ProvisioningRequests, picks the group that
// wasn't picked in the last loop, and runs ScaleUp for it.
type WrapperOrchestrator struct {
    // scaleUpRegularPods indicates that ScaleUp for regular pods will be run in the current CA loop, if they are present.
    scaleUpRegularPods  bool
    scaleUpOrchestrator scaleup.Orchestrator
    provReqOrchestrator scaleup.Orchestrator
}

// NewWrapperOrchestrator returns a WrapperOrchestrator.
func NewWrapperOrchestrator(kubeConfig *rest.Config) (scaleup.Orchestrator, error) {
    provReqOrchestrator, err := checkcapacity.New(kubeConfig)
    if err != nil {
        return nil, fmt.Errorf("failed to create ScaleUp orchestrator for ProvisioningRequests, error: %v", err)
    }
    return &WrapperOrchestrator{
        scaleUpOrchestrator: New(),
        provReqOrchestrator: provReqOrchestrator,
    }, nil
}

// Initialize initializes the orchestrator object with required fields.
func (o *WrapperOrchestrator) Initialize(
    autoscalingContext *context.AutoscalingContext,
    processors *ca_processors.AutoscalingProcessors,
    clusterStateRegistry *clusterstate.ClusterStateRegistry,
    taintConfig taints.TaintConfig,
) {
    o.scaleUpOrchestrator.Initialize(autoscalingContext, processors, clusterStateRegistry, taintConfig)
    o.provReqOrchestrator.Initialize(autoscalingContext, processors, clusterStateRegistry, taintConfig)
}

// ScaleUp runs the scale-up logic for either regular pods or pods from ProvisioningRequests.
func (o *WrapperOrchestrator) ScaleUp(
    unschedulablePods []*apiv1.Pod,
    nodes []*apiv1.Node,
    daemonSets []*appsv1.DaemonSet,
    nodeInfos map[string]*schedulerframework.NodeInfo,
) (*status.ScaleUpStatus, errors.AutoscalerError) {
    // Flip the choice after every loop so the two groups alternate.
    defer func() { o.scaleUpRegularPods = !o.scaleUpRegularPods }()

    provReqPods, regularPods := splitOut(unschedulablePods)
    if len(provReqPods) == 0 {
        o.scaleUpRegularPods = true
    } else if len(regularPods) == 0 {
        o.scaleUpRegularPods = false
    }

    if o.scaleUpRegularPods {
        return o.scaleUpOrchestrator.ScaleUp(regularPods, nodes, daemonSets, nodeInfos)
    }
    return o.provReqOrchestrator.ScaleUp(provReqPods, nodes, daemonSets, nodeInfos)
}

// splitOut separates pods created for a ProvisioningRequest (identified by their annotation) from regular pods.
func splitOut(unschedulablePods []*apiv1.Pod) (provReqPods, regularPods []*apiv1.Pod) {
    for _, pod := range unschedulablePods {
        if _, ok := pod.Annotations[provreq.ProvisioningRequestPodAnnotationKey]; ok {
            provReqPods = append(provReqPods, pod)
        } else {
            regularPods = append(regularPods, pod)
        }
    }
    return
}

// ScaleUpToNodeGroupMinSize tries to scale up node groups that have less nodes
// than the configured min size. The source of truth for the current node group
// size is the TargetSize queried directly from cloud providers. Returns
// appropriate status or error if an unexpected error occurred.
func (o *WrapperOrchestrator) ScaleUpToNodeGroupMinSize(
    nodes []*apiv1.Node,
    nodeInfos map[string]*schedulerframework.NodeInfo,
) (*status.ScaleUpStatus, errors.AutoscalerError) {
    return o.scaleUpOrchestrator.ScaleUpToNodeGroupMinSize(nodes, nodeInfos)
}
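
The doc comment describes the alternation, but it is easy to miss how the deferred flip and the two length checks interact. Below is a self-contained model of just that state machine (the model type and pick function are illustrative, not part of the commit), showing which orchestrator would run over a few consecutive loops. Because the choice flips after every call, clusters with both kinds of pending pods alternate between the two orchestrators rather than starving either group.

package main

import "fmt"

// model mirrors WrapperOrchestrator's group-picking state: it alternates
// between regular pods and ProvisioningRequest pods, unless only one kind
// is pending in a given loop.
type model struct{ scaleUpRegularPods bool }

func (m *model) pick(provReqPending, regularPending int) string {
    // Same deferred flip as WrapperOrchestrator.ScaleUp.
    defer func() { m.scaleUpRegularPods = !m.scaleUpRegularPods }()
    if provReqPending == 0 {
        m.scaleUpRegularPods = true
    } else if regularPending == 0 {
        m.scaleUpRegularPods = false
    }
    if m.scaleUpRegularPods {
        return "regular"
    }
    return "provisioning-request"
}

func main() {
    m := &model{}
    // Both kinds pending for three loops, then only regular pods pending.
    fmt.Println(m.pick(2, 2)) // provisioning-request
    fmt.Println(m.pick(2, 2)) // regular
    fmt.Println(m.pick(2, 2)) // provisioning-request
    fmt.Println(m.pick(0, 2)) // regular
}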
cluster-autoscaler/core/scaleup/orchestrator/wrapper_orchestrator_test.go

Lines changed: 90 additions & 0 deletions (new file)

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package orchestrator

import (
    "testing"

    "github.com/stretchr/testify/assert"
    appsv1 "k8s.io/api/apps/v1"
    apiv1 "k8s.io/api/core/v1"
    "k8s.io/autoscaler/cluster-autoscaler/clusterstate"
    "k8s.io/autoscaler/cluster-autoscaler/context"
    ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
    "k8s.io/autoscaler/cluster-autoscaler/processors/provreq"
    "k8s.io/autoscaler/cluster-autoscaler/processors/status"
    "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
    "k8s.io/autoscaler/cluster-autoscaler/utils/taints"
    . "k8s.io/autoscaler/cluster-autoscaler/utils/test"
    schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
    provisioningRequestErrorMsg = "provisioningRequestError"
    regularPodsErrorMsg         = "regularPodsError"
)

func TestScaleUp(t *testing.T) {
    o := WrapperOrchestrator{
        provReqOrchestrator: &fakeScaleUp{provisioningRequestErrorMsg},
        scaleUpOrchestrator: &fakeScaleUp{regularPodsErrorMsg},
    }
    regularPods := []*apiv1.Pod{
        BuildTestPod("pod-1", 1, 100),
        BuildTestPod("pod-2", 1, 100),
    }
    provReqPods := []*apiv1.Pod{
        BuildTestPod("pr-pod-1", 1, 100),
        BuildTestPod("pr-pod-2", 1, 100),
    }
    for _, pod := range provReqPods {
        pod.Annotations[provreq.ProvisioningRequestPodAnnotationKey] = "true"
    }
    unschedulablePods := append(regularPods, provReqPods...)
    // scaleUpRegularPods starts out false, so the first loop serves the ProvisioningRequest pods...
    _, err := o.ScaleUp(unschedulablePods, nil, nil, nil)
    assert.Equal(t, err.Error(), provisioningRequestErrorMsg)
    // ...and the deferred flip makes the second loop serve the regular pods.
    _, err = o.ScaleUp(unschedulablePods, nil, nil, nil)
    assert.Equal(t, err.Error(), regularPodsErrorMsg)
}

type fakeScaleUp struct {
    errorMsg string
}

func (f *fakeScaleUp) ScaleUp(
    unschedulablePods []*apiv1.Pod,
    nodes []*apiv1.Node,
    daemonSets []*appsv1.DaemonSet,
    nodeInfos map[string]*schedulerframework.NodeInfo,
) (*status.ScaleUpStatus, errors.AutoscalerError) {
    return nil, errors.NewAutoscalerError(errors.InternalError, f.errorMsg)
}

func (f *fakeScaleUp) Initialize(
    autoscalingContext *context.AutoscalingContext,
    processors *ca_processors.AutoscalingProcessors,
    clusterStateRegistry *clusterstate.ClusterStateRegistry,
    taintConfig taints.TaintConfig,
) {
}

func (f *fakeScaleUp) ScaleUpToNodeGroupMinSize(
    nodes []*apiv1.Node,
    nodeInfos map[string]*schedulerframework.NodeInfo,
) (*status.ScaleUpStatus, errors.AutoscalerError) {
    return nil, nil
}

cluster-autoscaler/main.go

Lines changed: 11 additions & 0 deletions
@@ -29,6 +29,7 @@ import (
     "time"
 
     "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/actuation"
+    "k8s.io/autoscaler/cluster-autoscaler/core/scaleup/orchestrator"
     "k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot"
     "k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker"
     kubelet_config "k8s.io/kubernetes/pkg/kubelet/apis/config"
@@ -468,6 +469,15 @@ func buildAutoscaler(debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter
     deleteOptions := options.NewNodeDeleteOptions(autoscalingOptions)
     drainabilityRules := rules.Default(deleteOptions)
 
+    scaleUpOrchestrator := orchestrator.New()
+    if *provisioningRequestsEnabled {
+        kubeClient := kube_util.GetKubeConfig(autoscalingOptions.KubeClientOpts)
+        scaleUpOrchestrator, err = orchestrator.NewWrapperOrchestrator(kubeClient)
+        if err != nil {
+            return nil, err
+        }
+    }
+
     opts := core.AutoscalerOptions{
         AutoscalingOptions:  autoscalingOptions,
         ClusterSnapshot:     clustersnapshot.NewDeltaClusterSnapshot(),
@@ -477,6 +487,7 @@ func buildAutoscaler(debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter
         PredicateChecker:    predicateChecker,
         DeleteOptions:       deleteOptions,
         DrainabilityRules:   drainabilityRules,
+        ScaleUpOrchestrator: scaleUpOrchestrator,
     }
 
     opts.Processors = ca_processors.DefaultProcessors(autoscalingOptions)

cluster-autoscaler/processors/provreq/provisioning_request_processors.go

Lines changed: 3 additions & 2 deletions
@@ -28,7 +28,8 @@ import (
 )
 
 const (
-    provisioningRequestPodAnnotationKey = "cluster-autoscaler.kubernetes.io/consume-provisioning-request"
+    // ProvisioningRequestPodAnnotationKey is an annotation on a pod that indicates the pod was created by a ProvisioningRequest.
+    ProvisioningRequestPodAnnotationKey = "cluster-autoscaler.kubernetes.io/consume-provisioning-request"
     maxProvReqEvent = 50
 )
 
@@ -101,6 +102,6 @@ func provisioningRequestName(pod *v1.Pod) (string, bool) {
     if pod == nil || pod.Annotations == nil {
        return "", false
     }
-    provReqName, found := pod.Annotations[provisioningRequestPodAnnotationKey]
+    provReqName, found := pod.Annotations[ProvisioningRequestPodAnnotationKey]
     return provReqName, found
 }
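
Exporting the constant lets other packages, such as the wrapper orchestrator above, reference the annotation without copying the string, and provisioningRequestName reads the annotation's value back as the ProvisioningRequest name. A hedged sketch of how a pod might be annotated so this path picks it up; podForProvReq and the names used are illustrative, not part of the commit:

package main

import (
    "fmt"

    apiv1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/autoscaler/cluster-autoscaler/processors/provreq"
)

// podForProvReq builds a pod annotated so Cluster Autoscaler's
// ProvisioningRequest path picks it up; the annotation value is the name of
// the ProvisioningRequest the pod consumes.
func podForProvReq(podName, provReqName string) *apiv1.Pod {
    return &apiv1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Name: podName,
            Annotations: map[string]string{
                provreq.ProvisioningRequestPodAnnotationKey: provReqName,
            },
        },
    }
}

func main() {
    pod := podForProvReq("workload-0", "check-capacity-request")
    // Prints "check-capacity-request", the name provisioningRequestName would return.
    fmt.Println(pod.Annotations[provreq.ProvisioningRequestPodAnnotationKey])
}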
