From 0a4ff5e98b985c35366f86d22ab171b3c44dc4b9 Mon Sep 17 00:00:00 2001 From: Darius Jahandarie Date: Wed, 2 Jul 2025 23:23:35 +0900 Subject: [PATCH] Limit HPA emergency mode to repeated failures --- .gitignore | 1 + pkg/hpa/service.go | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index c0a7a54c..a25b34c1 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ testbin/* *.swp *.swo *~ +.vscode diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 8574982a..05c34f2d 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -14,6 +14,7 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/record" @@ -784,9 +785,43 @@ func (c *Service) IsHpaMetricAvailable(ctx context.Context, currenthpa *v2.Horiz for _, condition := range conditions { if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" { - // switch to Emergency mode since no metrics - logger.Info("HPA failed to get resource metrics, switch to emergency mode") - return false + // HPA failed to get resource metrics, but we need to check how many times it failed, because FailedGetResourceMetric is an informational message and can happen for various reasons (including a new pod from recent scale up). 
+ logger.Info("HPA failed to get resource metrics, querying k8s events to see how many FailedGetResourceMetric events are there", "hpa", currenthpa.Name) + + sel := fields.AndSelectors( + fields.OneTermEqualSelector("involvedObject.kind", "HorizontalPodAutoscaler"), + fields.OneTermEqualSelector("involvedObject.name", currenthpa.Name), + fields.OneTermEqualSelector("involvedObject.namespace", currenthpa.Namespace), + fields.OneTermEqualSelector("reason", "FailedGetResourceMetric"), + ) + var evList corev1.EventList + opts := &client.ListOptions{ + Namespace: currenthpa.Namespace, + Raw: &metav1.ListOptions{ + FieldSelector: sel.String(), + }, + } + + if err := c.c.List(ctx, &evList, opts); err != nil { + logger.Info("Failed to get events for HPA to check for FailedGetResourceMetric, switch to emergency mode", "hpa", currenthpa.Name) + return false + } + + // count number of FailedGetResourceMetric events that occurred in the last 5 minutes + fiveMinutesAgo := time.Now().Add(-5 * time.Minute) + count := 0 + for _, ev := range evList.Items { + if ev.EventTime.Time.After(fiveMinutesAgo) { + count++ + } + } + + if count > 5 { + logger.Info("HPA failed to get resource metrics over 5 times in the last 5 minutes, switch to emergency mode", "hpa", currenthpa.Name, "failCount", count) + return false + } + + logger.Info("HPA did not fail to get resource metrics enough times to switch to emergency mode", "hpa", currenthpa.Name, "failCount", count) } }