Skip to content

Commit 1f9796c

Browse files
authored
node monitoring: correctly recognize NotReady Nodes as unscheduable (#345)
1 parent 9df8e9e commit 1f9796c

File tree

2 files changed

+77
-2
lines changed

2 files changed

+77
-2
lines changed

internal/controller/appwrapper/node_health_monitor.go

+15-2
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ var (
5252
// noScheduleNodes is a mapping from Node names to ResourceLists of unschedulable resources.
5353
// A resource may be unschedulable either because:
5454
// (a) the Node is cordoned (node.Spec.Unschedulable is true) or
55-
// (b) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
55+
// (b) the Node has been marked as NotReady by Kubernetes or
56+
// (c) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
5657
noScheduleNodes = make(map[string]v1.ResourceList)
5758
// noScheduleNodesMutex synchronizes access to noScheduleNodes
5859
noScheduleNodesMutex sync.RWMutex
@@ -136,7 +137,7 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
136137
// update noScheduleNodes entry for node
137138
func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.Node) {
138139
var noScheduleResources v1.ResourceList
139-
if node.Spec.Unschedulable {
140+
if r.nodeIsUnscheduable(node) {
140141
noScheduleResources = node.Status.Capacity.DeepCopy()
141142
delete(noScheduleResources, v1.ResourcePods)
142143
} else {
@@ -178,6 +179,18 @@ func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.
178179
}
179180
}
180181

182+
func (r *NodeHealthMonitor) nodeIsUnscheduable(node *v1.Node) bool {
183+
if node.Spec.Unschedulable {
184+
return true
185+
}
186+
for _, taint := range node.Spec.Taints {
187+
if taint.Key == "node.kubernetes.io/unreachable" || taint.Key == "node.kubernetes.io/not-ready" {
188+
return true
189+
}
190+
}
191+
return false
192+
}
193+
181194
// SetupWithManager sets up the controller with the Manager.
182195
func (r *NodeHealthMonitor) SetupWithManager(mgr ctrl.Manager) error {
183196
return ctrl.NewControllerManagedBy(mgr).

internal/controller/appwrapper/node_health_monitor_test.go

+62
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,13 @@ var _ = Describe("NodeMonitor Controller", func() {
4343
Expect(k8sClient.Create(ctx, node)).To(Succeed())
4444
node = getNode(nodeName)
4545
node.Status.Capacity = nodeGPUs
46+
node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
47+
Type: v1.NodeReady,
48+
Status: v1.ConditionTrue,
49+
})
4650
Expect(k8sClient.Status().Update(ctx, node)).To(Succeed())
51+
node.Spec.Taints = []v1.Taint{}
52+
Expect(k8sClient.Update(ctx, node)).To(Succeed())
4753
}
4854

4955
deleteNode := func(nodeName string) {
@@ -106,6 +112,62 @@ var _ = Describe("NodeMonitor Controller", func() {
106112
Expect(err).NotTo(HaveOccurred())
107113
Expect(noExecuteNodes).Should(BeEmpty())
108114

115+
By("A Node tainted as unreachable is detected as unscheduable")
116+
node = getNode(node1Name.Name)
117+
node.Spec.Taints = append(node.Spec.Taints, v1.Taint{Key: "node.kubernetes.io/unreachable", Effect: v1.TaintEffectNoExecute})
118+
Expect(k8sClient.Update(ctx, node)).Should(Succeed())
119+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
120+
Expect(err).NotTo(HaveOccurred())
121+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
122+
Expect(err).NotTo(HaveOccurred())
123+
Expect(noScheduleNodes).Should(HaveLen(1))
124+
Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
125+
Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
126+
127+
By("Repeated reconcile does not change map")
128+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
129+
Expect(err).NotTo(HaveOccurred())
130+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
131+
Expect(err).NotTo(HaveOccurred())
132+
Expect(noScheduleNodes).Should(HaveLen(1))
133+
Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
134+
Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
135+
136+
By("Removing the taint updates unhealthyNodes")
137+
node.Spec.Taints = []v1.Taint{}
138+
Expect(k8sClient.Update(ctx, node)).Should(Succeed())
139+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
140+
Expect(err).NotTo(HaveOccurred())
141+
Expect(noScheduleNodes).Should(BeEmpty())
142+
143+
By("A Node tainted as not-read is detected as unscheduable")
144+
node = getNode(node1Name.Name)
145+
node.Spec.Taints = append(node.Spec.Taints, v1.Taint{Key: "node.kubernetes.io/not-ready", Effect: v1.TaintEffectNoExecute})
146+
Expect(k8sClient.Update(ctx, node)).Should(Succeed())
147+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
148+
Expect(err).NotTo(HaveOccurred())
149+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
150+
Expect(err).NotTo(HaveOccurred())
151+
Expect(noScheduleNodes).Should(HaveLen(1))
152+
Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
153+
Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
154+
155+
By("Repeated reconcile does not change map")
156+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
157+
Expect(err).NotTo(HaveOccurred())
158+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
159+
Expect(err).NotTo(HaveOccurred())
160+
Expect(noScheduleNodes).Should(HaveLen(1))
161+
Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
162+
Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
163+
164+
By("Removing the taint updates unhealthyNodes")
165+
node.Spec.Taints = []v1.Taint{}
166+
Expect(k8sClient.Update(ctx, node)).Should(Succeed())
167+
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
168+
Expect(err).NotTo(HaveOccurred())
169+
Expect(noScheduleNodes).Should(BeEmpty())
170+
109171
deleteNode(node1Name.Name)
110172
deleteNode(node2Name.Name)
111173
})

0 commit comments

Comments
 (0)