
Commit dea20c0

Autopilot: Automatic reset/resume of AppWrappers whose pods are using unhealthy resources (#199)
1 parent 68bd9cf commit dea20c0

File tree: 12 files changed, +246 −6 lines

config/rbac/role.yaml

Lines changed: 8 additions & 0 deletions

@@ -13,6 +13,14 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - ""
   resources:
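The new rule lets the controller's service account read Node objects, which the NodeHealthMonitor added below needs. One way to spot-check the deployed RBAC is kubectl auth can-i; the service account and namespace shown are the usual kubebuilder defaults and are assumptions, not taken from this commit:

  # Placeholder service account/namespace; adjust to your deployment.
  kubectl auth can-i list nodes \
    --as=system:serviceaccount:appwrapper-system:appwrapper-controller-manager
  # expected: yes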

hack/create-test-cluster.sh

Lines changed: 1 addition & 0 deletions

@@ -24,4 +24,5 @@ update_test_host
 check_prerequisites
 pull_images
 kind_up_cluster
+add_virtual_GPUs
 configure_cluster

hack/default-queues.yaml

Lines changed: 4 additions & 2 deletions

@@ -10,12 +10,14 @@ metadata:
 spec:
   namespaceSelector: {} # match all.
   resourceGroups:
-  - coveredResources: ["cpu"]
+  - coveredResources: ["cpu", "nvidia.com/gpu"]
     flavors:
     - name: "default-flavor"
      resources:
      - name: "cpu"
-       nominalQuota: 4
+       nominalQuota: 8
+     - name: "nvidia.com/gpu"
+       nominalQuota: 16
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue

hack/e2e-util.sh

Lines changed: 52 additions & 0 deletions

@@ -204,6 +204,58 @@ function wait_for_appwrapper_controller {
   echo ""
 }

+function add_virtual_GPUs {
+  # Patch nodes to provide GPUs resources without physical GPUs.
+  # This is intended to allow testing of our autopilot integration.
+
+  # Start communication with cluster
+  kubectl proxy --port=0 > .port.dat 2>&1 &
+  proxy_pid=$!
+
+  echo "Starting background proxy connection (pid=${proxy_pid})..."
+  echo "Waiting for proxy process to start."
+  sleep 5
+
+  kube_proxy_port=$(cat .port.dat | awk '{split($5, substrings, ":"); print substrings[2]}')
+  curl -s 127.0.0.1:${kube_proxy_port} > /dev/null 2>&1
+
+  if [[ ! $? -eq 0 ]]; then
+    echo "Calling 'kubectl proxy' did not create a successful connection to the kubelet needed to patch the nodes. Exiting."
+    kill -9 ${proxy_pid}
+    exit 1
+  else
+    echo "Connected to the kubelet for patching the nodes. Using port ${kube_proxy_port}."
+  fi
+
+  rm .port.dat
+
+  # Variables
+  resource_name="nvidia.com~1gpu"
+  resource_count="8"
+
+  # Patch nodes
+  for node_name in $(kubectl get nodes --no-headers -o custom-columns=":metadata.name")
+  do
+    echo "- Patching node (add): ${node_name}"
+
+    patching_status=$(curl -s --header "Content-Type: application/json-patch+json" \
+      --request PATCH \
+      --data '[{"op": "add", "path": "/status/capacity/'${resource_name}'", "value": "'${resource_count}'"}]' \
+      http://localhost:${kube_proxy_port}/api/v1/nodes/${node_name}/status | jq -r '.status')
+
+    if [[ ${patching_status} == "Failure" ]]; then
+      echo "Failed to patch node '${node_name}' with GPU resources"
+      exit 1
+    fi
+
+    echo "Patching done!"
+  done
+
+  # Stop communication with cluster
+  echo "Killing proxy (pid=${proxy_pid})..."
+  kill -9 ${proxy_pid}
+}
+
 # clean up
 function cleanup {
   echo "==========================>>>>> Cleaning up... <<<<<=========================="

hack/kind-config.yaml

Lines changed: 4 additions & 1 deletion

@@ -7,7 +7,10 @@ nodes:
 - role: control-plane
   # kubernetes version 1.27.13 from kind v0.23.0
   image: kindest/node:v1.27.13@sha256:17439fa5b32290e3ead39ead1250dca1d822d94a10d26f1981756cd51b24b9d8
-# the worker
+# the workers
+- role: worker
+  # kubernetes version 1.27.13 from kind v0.23.0
+  image: kindest/node:v1.27.13@sha256:17439fa5b32290e3ead39ead1250dca1d822d94a10d26f1981756cd51b24b9d8
 - role: worker
   # kubernetes version 1.27.13 from kind v0.23.0
   image: kindest/node:v1.27.13@sha256:17439fa5b32290e3ead39ead1250dca1d822d94a10d26f1981756cd51b24b9d8
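With the second worker, the test cluster has somewhere to reschedule pods that are evacuated from an unhealthy node. A rough sanity check after running hack/create-test-cluster.sh (or a manual kind create cluster --config hack/kind-config.yaml):

  kubectl get nodes
  # expect one control-plane node and two worker nodes, all Ready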

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 40 additions & 0 deletions

@@ -31,6 +31,7 @@ import (
     "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/apimachinery/pkg/util/sets"

     ctrl "sigs.k8s.io/controller-runtime"
     "sigs.k8s.io/controller-runtime/pkg/client"
@@ -64,6 +65,7 @@ type podStatusSummary struct {
     succeeded       int32
     failed          int32
     terminalFailure bool
+    unhealthyNodes  sets.Set[string]
 }

 type componentStatusSummary struct {
@@ -299,6 +301,18 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
         }
     }

+    // Initiate migration of workloads that are using resources that Autopilot has flagged as unhealthy
+    if len(podStatus.unhealthyNodes) > 0 {
+        meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
+            Type:    string(workloadv1beta2.Unhealthy),
+            Status:  metav1.ConditionTrue,
+            Reason:  "AutopilotUnhealthy",
+            Message: fmt.Sprintf("Workload contains pods using unhealthy resources on Nodes: %v", podStatus.unhealthyNodes),
+        })
+        // Go to reset directly because an Autopilot triggered evacuation does not count against the retry limit
+        return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperResetting)
+    }
+
     clearCondition(aw, workloadv1beta2.Unhealthy, "FoundNoFailedPods", "")

     if podStatus.running+podStatus.succeeded >= podStatus.expected {
@@ -483,6 +497,7 @@ func (r *AppWrapperReconciler) resetOrFail(ctx context.Context, aw *workloadv1be
     }
 }

+//gocyclo:ignore
 func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1beta2.AppWrapper) (*podStatusSummary, error) {
     pods := &v1.PodList{}
     if err := r.List(ctx, pods,
@@ -497,6 +512,31 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b
     summary := &podStatusSummary{expected: pc}

     for _, pod := range pods.Items {
+        if len(unhealthyNodes) > 0 {
+            if resources, ok := unhealthyNodes[pod.Spec.NodeName]; ok {
+                for badResource := range resources {
+                    for _, container := range pod.Spec.Containers {
+                        if limit, ok := container.Resources.Limits[v1.ResourceName(badResource)]; ok {
+                            if !limit.IsZero() {
+                                if summary.unhealthyNodes == nil {
+                                    summary.unhealthyNodes = make(sets.Set[string])
+                                }
+                                summary.unhealthyNodes.Insert(pod.Spec.NodeName)
+                            }
+                        }
+                        if request, ok := container.Resources.Requests[v1.ResourceName(badResource)]; ok {
+                            if !request.IsZero() {
+                                if summary.unhealthyNodes == nil {
+                                    summary.unhealthyNodes = make(sets.Set[string])
+                                }
+                                summary.unhealthyNodes.Insert(pod.Spec.NodeName)
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
         switch pod.Status.Phase {
         case v1.PodPending:
             summary.pending += 1
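Once any pod of an AppWrapper requests or limits a resource that Autopilot has flagged on that pod's node, the reconciler sets the Unhealthy condition with reason AutopilotUnhealthy and moves the AppWrapper to Resetting without consuming a retry. A rough way to observe this from the CLI, using the sample-gpu-job AppWrapper added in this commit's samples (the jsonpath expressions are illustrative, not part of the commit):

  kubectl get appwrapper sample-gpu-job -o jsonpath='{.status.phase}'
  # expected to move through Resetting -> Resuming -> Running after an Autopilot event

  kubectl get appwrapper sample-gpu-job \
    -o jsonpath='{.status.conditions[?(@.type=="Unhealthy")].reason}'
  # expected: AutopilotUnhealthy while the evacuation is in progress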
Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+/*
+Copyright 2024 IBM Corporation.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package appwrapper
+
+import (
+    "context"
+
+    v1 "k8s.io/api/core/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime/schema"
+    "k8s.io/apimachinery/pkg/util/sets"
+
+    ctrl "sigs.k8s.io/controller-runtime"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+    "sigs.k8s.io/controller-runtime/pkg/handler"
+    "sigs.k8s.io/controller-runtime/pkg/log"
+
+    "github.com/project-codeflare/appwrapper/pkg/config"
+)
+
+// NodeHealthMonitor maintains the set of nodes that Autopilot has labelled as unhealthy
+type NodeHealthMonitor struct {
+    client.Client
+    Config *config.AppWrapperConfig
+}
+
+// unhealthyNodes is a mapping from Node names to a set of resources that Autopilot has labeled as unhealthy on that Node
+var unhealthyNodes = make(map[string]sets.Set[string])
+
+// permission to watch nodes
+//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
+
+func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+    node := &metav1.PartialObjectMetadata{}
+    node.SetGroupVersionKind(schema.GroupVersionKind{
+        Group:   "",
+        Version: "v1",
+        Kind:    "Node",
+    })
+    if err := r.Get(ctx, req.NamespacedName, node); err != nil {
+        return ctrl.Result{}, nil
+    }
+
+    log.FromContext(ctx).V(2).Info("Reconcilling", "node", req.NamespacedName)
+
+    flaggedResources := make(sets.Set[string])
+    for key, value := range node.GetLabels() {
+        for resource, apLabels := range r.Config.Autopilot.ResourceUnhealthyConfig {
+            if apValue, ok := apLabels[key]; ok && apValue == value {
+                flaggedResources.Insert(resource)
+            }
+        }
+    }
+
+    hadEntries := len(flaggedResources) > 0
+
+    if len(flaggedResources) == 0 {
+        delete(unhealthyNodes, node.GetName())
+    } else {
+        unhealthyNodes[node.GetName()] = flaggedResources
+    }
+
+    if len(unhealthyNodes) == 0 {
+        if hadEntries {
+            log.FromContext(ctx).Info("All nodes now healthy")
+        } else {
+            log.FromContext(ctx).V(2).Info("All nodes now healthy")
+        }
+    } else {
+        log.FromContext(ctx).Info("Some nodes unhealthy", "number", len(unhealthyNodes), "details", unhealthyNodes)
+    }
+
+    return ctrl.Result{}, nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *NodeHealthMonitor) SetupWithManager(mgr ctrl.Manager) error {
+    return ctrl.NewControllerManagedBy(mgr).
+        WatchesMetadata(&v1.Node{}, &handler.EnqueueRequestForObject{}).
+        Named("NodeMonitor").
+        Complete(r)
+}
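The monitor watches Node metadata and records, per node, which resources the configured Autopilot labels mark as unhealthy; getPodStatus above consults that map. A minimal sketch for exercising it on a test cluster, assuming the default ResourceUnhealthyConfig from pkg/config/config.go (<node-name> is a placeholder):

  # Simulate Autopilot flagging the GPUs on a node as unhealthy.
  kubectl label node <node-name> autopilot.ibm.com/gpuhealth=ERR --overwrite

  # AppWrappers with pods requesting nvidia.com/gpu on that node should be
  # reset and resumed onto healthy nodes.

  # Mark the node healthy again by removing the label.
  kubectl label node <node-name> autopilot.ibm.com/gpuhealth-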

pkg/config/config.go

Lines changed: 1 addition & 1 deletion

@@ -103,7 +103,7 @@ func NewAppWrapperConfig() *AppWrapperConfig {
             InjectAffinity:    true,
             EvacuateWorkloads: true,
             ResourceUnhealthyConfig: map[string]map[string]string{
-                "nvidia.com/gpu": {"autopilot.ibm.com/health": "ERR"},
+                "nvidia.com/gpu": {"autopilot.ibm.com/gpuhealth": "ERR"},
             },
         },
         UserRBACAdmissionCheck: true,
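The default configuration now keys off the autopilot.ibm.com/gpuhealth label (value ERR) rather than autopilot.ibm.com/health. As an illustrative check of which nodes currently carry that label:

  kubectl get nodes -L autopilot.ibm.com/gpuhealth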

pkg/controller/setup.go

Lines changed: 9 additions & 0 deletions

@@ -50,6 +50,15 @@ func SetupControllers(mgr ctrl.Manager, awConfig *config.AppWrapperConfig) error
         }
     }

+    if awConfig.Autopilot != nil && awConfig.Autopilot.EvacuateWorkloads {
+        if err := (&appwrapper.NodeHealthMonitor{
+            Client: mgr.GetClient(),
+            Config: awConfig,
+        }).SetupWithManager(mgr); err != nil {
+            return fmt.Errorf("node health monitor: %w", err)
+        }
+    }
+
     if err := (&appwrapper.AppWrapperReconciler{
         Client: mgr.GetClient(),
         Scheme: mgr.GetScheme(),

samples/wrapped-gpu-job.yaml

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+apiVersion: workload.codeflare.dev/v1beta2
+kind: AppWrapper
+metadata:
+  name: sample-gpu-job
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue
+  annotations:
+    workload.codeflare.dev.appwrapper/successTTLDuration: "1m"
+spec:
+  components:
+  - template:
+      apiVersion: batch/v1
+      kind: Job
+      metadata:
+        name: sample-gpu-job
+      spec:
+        template:
+          spec:
+            restartPolicy: Never
+            containers:
+            - name: busybox
+              image: quay.io/project-codeflare/busybox:1.36
+              command: ["sh", "-c", "sleep 600"]
+              resources:
+                requests:
+                  cpu: 1
+                  nvidia.com/gpu: 4
+                limits:
+                  nvidia.com/gpu: 4
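The sample requests 4 GPUs per pod, which fits within the 16-GPU nominal quota added to hack/default-queues.yaml and the 8 virtual GPUs patched onto each kind node. A sketch of submitting it on the test cluster:

  kubectl apply -f samples/wrapped-gpu-job.yaml
  kubectl get appwrappers
  # sample-gpu-job should be admitted by Kueue and reach Running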

samples/wrapped-job.yaml

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ spec:
             containers:
             - name: busybox
               image: quay.io/project-codeflare/busybox:1.36
-              command: ["sh", "-c", "sleep 30"]
+              command: ["sh", "-c", "sleep 600"]
               resources:
                 requests:
                   cpu: 1

samples/wrapped-pod.yaml

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ spec:
       containers:
       - name: busybox
         image: quay.io/project-codeflare/busybox:1.36
-        command: ["sh", "-c", "sleep 5"]
+        command: ["sh", "-c", "sleep 600"]
         resources:
           requests:
             cpu: 1
