
Commit dea20c0

Autopilot: Automatic reset/resume of AppWrappers whose pods are using unhealthy resources (#199)
1 parent 68bd9cf commit dea20c0

File tree: 12 files changed, +246 −6 lines

config/rbac/role.yaml

Lines changed: 8 additions & 0 deletions

@@ -13,6 +13,14 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - ""
   resources:
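The new rule lets the controller's service account read Node objects, which the NodeHealthMonitor added below needs. One way to spot-check the deployed RBAC is kubectl auth can-i; the service account and namespace shown are the usual kubebuilder defaults and are assumptions, not taken from this commit:

  # Placeholder service account/namespace; adjust to your deployment.
  kubectl auth can-i list nodes \
    --as=system:serviceaccount:appwrapper-system:appwrapper-controller-manager
  # expected: yes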

hack/create-test-cluster.sh

Lines changed: 1 addition & 0 deletions

@@ -24,4 +24,5 @@ update_test_host
 check_prerequisites
 pull_images
 kind_up_cluster
+add_virtual_GPUs
 configure_cluster

hack/default-queues.yaml

Lines changed: 4 additions & 2 deletions

@@ -10,12 +10,14 @@ metadata:
 spec:
   namespaceSelector: {} # match all.
   resourceGroups:
-  - coveredResources: ["cpu"]
+  - coveredResources: ["cpu", "nvidia.com/gpu"]
     flavors:
     - name: "default-flavor"
      resources:
      - name: "cpu"
-       nominalQuota: 4
+       nominalQuota: 8
+     - name: "nvidia.com/gpu"
+       nominalQuota: 16
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue

hack/e2e-util.sh

Lines changed: 52 additions & 0 deletions

@@ -204,6 +204,58 @@ function wait_for_appwrapper_controller {
   echo ""
 }

+function add_virtual_GPUs {
+  # Patch nodes to provide GPUs resources without physical GPUs.
+  # This is intended to allow testing of our autopilot integration.
+
+  # Start communication with cluster
+  kubectl proxy --port=0 > .port.dat 2>&1 &
+  proxy_pid=$!
+
+  echo "Starting background proxy connection (pid=${proxy_pid})..."
+  echo "Waiting for proxy process to start."
+  sleep 5
+
+  kube_proxy_port=$(cat .port.dat | awk '{split($5, substrings, ":"); print substrings[2]}')
+  curl -s 127.0.0.1:${kube_proxy_port} > /dev/null 2>&1
+
+  if [[ ! $? -eq 0 ]]; then
+    echo "Calling 'kubectl proxy' did not create a successful connection to the kubelet needed to patch the nodes. Exiting."
+    kill -9 ${proxy_pid}
+    exit 1
+  else
+    echo "Connected to the kubelet for patching the nodes. Using port ${kube_proxy_port}."
+  fi
+
+  rm .port.dat
+
+  # Variables
+  resource_name="nvidia.com~1gpu"
+  resource_count="8"
+
+  # Patch nodes
+  for node_name in $(kubectl get nodes --no-headers -o custom-columns=":metadata.name")
+  do
+    echo "- Patching node (add): ${node_name}"
+
+    patching_status=$(curl -s --header "Content-Type: application/json-patch+json" \
+      --request PATCH \
+      --data '[{"op": "add", "path": "/status/capacity/'${resource_name}'", "value": "'${resource_count}'"}]' \
+      http://localhost:${kube_proxy_port}/api/v1/nodes/${node_name}/status | jq -r '.status')
+
+    if [[ ${patching_status} == "Failure" ]]; then
+      echo "Failed to patch node '${node_name}' with GPU resources"
+      exit 1
+    fi
+
+    echo "Patching done!"
+  done
+
+  # Stop communication with cluster
+  echo "Killing proxy (pid=${proxy_pid})..."
+  kill -9 ${proxy_pid}
+}
+
 # clean up
 function cleanup {
   echo "==========================>>>>> Cleaning up... <<<<<=========================="

hack/kind-config.yaml

Lines changed: 4 additions & 1 deletion

@@ -7,7 +7,10 @@ nodes:
 - role: control-plane
   # kubernetes version 1.27.13 from kind v0.23.0
   image: kindest/node:v1.27.13@sha256:17439fa5b32290e3ead39ead1250dca1d822d94a10d26f1981756cd51b24b9d8
-# the worker
+# the workers
+- role: worker
+  # kubernetes version 1.27.13 from kind v0.23.0
+  image: kindest/node:v1.27.13@sha256:17439fa5b32290e3ead39ead1250dca1d822d94a10d26f1981756cd51b24b9d8
 - role: worker
   # kubernetes version 1.27.13 from kind v0.23.0
   image: kindest/node:v1.27.13@sha256:17439fa5b32290e3ead39ead1250dca1d822d94a10d26f1981756cd51b24b9d8
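With the second worker, the test cluster has somewhere to reschedule pods that are evacuated from an unhealthy node. A rough sanity check after running hack/create-test-cluster.sh (or a manual kind create cluster --config hack/kind-config.yaml):

  kubectl get nodes
  # expect one control-plane node and two worker nodes, all Ready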

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 40 additions & 0 deletions

@@ -31,6 +31,7 @@ import (
     "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/apimachinery/pkg/util/sets"

     ctrl "sigs.k8s.io/controller-runtime"
     "sigs.k8s.io/controller-runtime/pkg/client"
@@ -64,6 +65,7 @@ type podStatusSummary struct {
     succeeded       int32
     failed          int32
     terminalFailure bool
+    unhealthyNodes  sets.Set[string]
 }

 type componentStatusSummary struct {
@@ -299,6 +301,18 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
         }
     }

+    // Initiate migration of workloads that are using resources that Autopilot has flagged as unhealthy
+    if len(podStatus.unhealthyNodes) > 0 {
+        meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
+            Type:    string(workloadv1beta2.Unhealthy),
+            Status:  metav1.ConditionTrue,
+            Reason:  "AutopilotUnhealthy",
+            Message: fmt.Sprintf("Workload contains pods using unhealthy resources on Nodes: %v", podStatus.unhealthyNodes),
+        })
+        // Go to reset directly because an Autopilot triggered evacuation does not count against the retry limit
+        return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperResetting)
+    }
+
     clearCondition(aw, workloadv1beta2.Unhealthy, "FoundNoFailedPods", "")

     if podStatus.running+podStatus.succeeded >= podStatus.expected {
@@ -483,6 +497,7 @@ func (r *AppWrapperReconciler) resetOrFail(ctx context.Context, aw *workloadv1be
     }
 }

+//gocyclo:ignore
 func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1beta2.AppWrapper) (*podStatusSummary, error) {
     pods := &v1.PodList{}
     if err := r.List(ctx, pods,
@@ -497,6 +512,31 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b
     summary := &podStatusSummary{expected: pc}

     for _, pod := range pods.Items {
+        if len(unhealthyNodes) > 0 {
+            if resources, ok := unhealthyNodes[pod.Spec.NodeName]; ok {
+                for badResource := range resources {
+                    for _, container := range pod.Spec.Containers {
+                        if limit, ok := container.Resources.Limits[v1.ResourceName(badResource)]; ok {
+                            if !limit.IsZero() {
+                                if summary.unhealthyNodes == nil {
+                                    summary.unhealthyNodes = make(sets.Set[string])
+                                }
+                                summary.unhealthyNodes.Insert(pod.Spec.NodeName)
+                            }
+                        }
+                        if request, ok := container.Resources.Requests[v1.ResourceName(badResource)]; ok {
+                            if !request.IsZero() {
+                                if summary.unhealthyNodes == nil {
+                                    summary.unhealthyNodes = make(sets.Set[string])
+                                }
+                                summary.unhealthyNodes.Insert(pod.Spec.NodeName)
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
         switch pod.Status.Phase {
         case v1.PodPending:
             summary.pending += 1
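Once any pod of an AppWrapper requests or limits a resource that Autopilot has flagged on that pod's node, the reconciler sets the Unhealthy condition with reason AutopilotUnhealthy and moves the AppWrapper to Resetting without consuming a retry. A rough way to observe this from the CLI, using the sample-gpu-job AppWrapper added in this commit's samples (the jsonpath expressions are illustrative, not part of the commit):

  kubectl get appwrapper sample-gpu-job -o jsonpath='{.status.phase}'
  # expected to move through Resetting -> Resuming -> Running after an Autopilot event

  kubectl get appwrapper sample-gpu-job \
    -o jsonpath='{.status.conditions[?(@.type=="Unhealthy")].reason}'
  # expected: AutopilotUnhealthy while the evacuation is in progress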
Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+/*
+Copyright 2024 IBM Corporation.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package appwrapper
+
+import (
+    "context"
+
+    v1 "k8s.io/api/core/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime/schema"
+    "k8s.io/apimachinery/pkg/util/sets"
+
+    ctrl "sigs.k8s.io/controller-runtime"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+    "sigs.k8s.io/controller-runtime/pkg/handler"
+    "sigs.k8s.io/controller-runtime/pkg/log"
+
+    "github.com/project-codeflare/appwrapper/pkg/config"
+)
+
+// NodeHealthMonitor maintains the set of nodes that Autopilot has labelled as unhealthy
+type NodeHealthMonitor struct {
+    client.Client
+    Config *config.AppWrapperConfig
+}
+
+// unhealthyNodes is a mapping from Node names to a set of resources that Autopilot has labeled as unhealthy on that Node
+var unhealthyNodes = make(map[string]sets.Set[string])
+
+// permission to watch nodes
+//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
+
+func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+    node := &metav1.PartialObjectMetadata{}
+    node.SetGroupVersionKind(schema.GroupVersionKind{
+        Group:   "",
+        Version: "v1",
+        Kind:    "Node",
+    })
+    if err := r.Get(ctx, req.NamespacedName, node); err != nil {
+        return ctrl.Result{}, nil
+    }
+
+    log.FromContext(ctx).V(2).Info("Reconcilling", "node", req.NamespacedName)
+
+    flaggedResources := make(sets.Set[string])
+    for key, value := range node.GetLabels() {
+        for resource, apLabels := range r.Config.Autopilot.ResourceUnhealthyConfig {
+            if apValue, ok := apLabels[key]; ok && apValue == value {
+                flaggedResources.Insert(resource)
+            }
+        }
+    }
+
+    hadEntries := len(flaggedResources) > 0
+
+    if len(flaggedResources) == 0 {
+        delete(unhealthyNodes, node.GetName())
+    } else {
+        unhealthyNodes[node.GetName()] = flaggedResources
+    }
+
+    if len(unhealthyNodes) == 0 {
+        if hadEntries {
+            log.FromContext(ctx).Info("All nodes now healthy")
+        } else {
+            log.FromContext(ctx).V(2).Info("All nodes now healthy")
+        }
+    } else {
+        log.FromContext(ctx).Info("Some nodes unhealthy", "number", len(unhealthyNodes), "details", unhealthyNodes)
+    }
+
+    return ctrl.Result{}, nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *NodeHealthMonitor) SetupWithManager(mgr ctrl.Manager) error {
+    return ctrl.NewControllerManagedBy(mgr).
+        WatchesMetadata(&v1.Node{}, &handler.EnqueueRequestForObject{}).
+        Named("NodeMonitor").
+        Complete(r)
+}
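The monitor watches Node metadata and records, per node, which resources the configured Autopilot labels mark as unhealthy; getPodStatus above consults that map. A minimal sketch for exercising it on a test cluster, assuming the default ResourceUnhealthyConfig from pkg/config/config.go (<node-name> is a placeholder):

  # Simulate Autopilot flagging the GPUs on a node as unhealthy.
  kubectl label node <node-name> autopilot.ibm.com/gpuhealth=ERR --overwrite

  # AppWrappers with pods requesting nvidia.com/gpu on that node should be
  # reset and resumed onto healthy nodes.

  # Mark the node healthy again by removing the label.
  kubectl label node <node-name> autopilot.ibm.com/gpuhealth-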

pkg/config/config.go

Lines changed: 1 addition & 1 deletion

@@ -103,7 +103,7 @@ func NewAppWrapperConfig() *AppWrapperConfig {
             InjectAffinity:    true,
             EvacuateWorkloads: true,
             ResourceUnhealthyConfig: map[string]map[string]string{
-                "nvidia.com/gpu": {"autopilot.ibm.com/health": "ERR"},
+                "nvidia.com/gpu": {"autopilot.ibm.com/gpuhealth": "ERR"},
             },
         },
         UserRBACAdmissionCheck: true,
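The default configuration now keys off the autopilot.ibm.com/gpuhealth label (value ERR) rather than autopilot.ibm.com/health. As an illustrative check of which nodes currently carry that label:

  kubectl get nodes -L autopilot.ibm.com/gpuhealth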

pkg/controller/setup.go

Lines changed: 9 additions & 0 deletions

@@ -50,6 +50,15 @@ func SetupControllers(mgr ctrl.Manager, awConfig *config.AppWrapperConfig) error
         }
     }

+    if awConfig.Autopilot != nil && awConfig.Autopilot.EvacuateWorkloads {
+        if err := (&appwrapper.NodeHealthMonitor{
+            Client: mgr.GetClient(),
+            Config: awConfig,
+        }).SetupWithManager(mgr); err != nil {
+            return fmt.Errorf("node health monitor: %w", err)
+        }
+    }
+
     if err := (&appwrapper.AppWrapperReconciler{
         Client: mgr.GetClient(),
         Scheme: mgr.GetScheme(),

samples/wrapped-gpu-job.yaml

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+apiVersion: workload.codeflare.dev/v1beta2
+kind: AppWrapper
+metadata:
+  name: sample-gpu-job
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue
+  annotations:
+    workload.codeflare.dev.appwrapper/successTTLDuration: "1m"
+spec:
+  components:
+  - template:
+      apiVersion: batch/v1
+      kind: Job
+      metadata:
+        name: sample-gpu-job
+      spec:
+        template:
+          spec:
+            restartPolicy: Never
+            containers:
+            - name: busybox
+              image: quay.io/project-codeflare/busybox:1.36
+              command: ["sh", "-c", "sleep 600"]
+              resources:
+                requests:
+                  cpu: 1
+                  nvidia.com/gpu: 4
+                limits:
+                  nvidia.com/gpu: 4
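The sample requests 4 GPUs per pod, which fits within the 16-GPU nominal quota added to hack/default-queues.yaml and the 8 virtual GPUs patched onto each kind node. A sketch of submitting it on the test cluster:

  kubectl apply -f samples/wrapped-gpu-job.yaml
  kubectl get appwrappers
  # sample-gpu-job should be admitted by Kueue and reach Running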

samples/wrapped-job.yaml

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ spec:
             containers:
             - name: busybox
               image: quay.io/project-codeflare/busybox:1.36
-              command: ["sh", "-c", "sleep 30"]
+              command: ["sh", "-c", "sleep 600"]
               resources:
                 requests:
                   cpu: 1

samples/wrapped-pod.yaml

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ spec:
       containers:
       - name: busybox
         image: quay.io/project-codeflare/busybox:1.36
-        command: ["sh", "-c", "sleep 5"]
+        command: ["sh", "-c", "sleep 600"]
         resources:
           requests:
             cpu: 1
