From d1251ad7a8909c5943ce7fea48a9ef8abc3162c4 Mon Sep 17 00:00:00 2001
From: weliang1
Date: Wed, 22 Oct 2025 15:33:58 -0400
Subject: [PATCH] [WIP]: Check operators state when Pod was deleted

---
 .../legacynetworkmonitortests/networking.go | 61 ++++++++++++-------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/pkg/monitortests/network/legacynetworkmonitortests/networking.go b/pkg/monitortests/network/legacynetworkmonitortests/networking.go
index 0871f0cc158d..f38d2cfc1955 100644
--- a/pkg/monitortests/network/legacynetworkmonitortests/networking.go
+++ b/pkg/monitortests/network/legacynetworkmonitortests/networking.go
@@ -182,38 +182,43 @@ func testPodSandboxCreation(events monitorapi.Intervals, clientConfig *rest.Conf
 		}
 		partialLocator := monitorapi.NonUniquePodLocatorFrom(event.Locator)
+		progressingOperatorName := getProgressingOperatorName(event, operatorsProgressing)
+
 		if deletionTime := getPodDeletionTime(eventsForPods[partialLocator], event.Locator); deletionTime == nil {
-			var progressingOperatorName string
-			for _, operatorProgressingInterval := range operatorsProgressing {
-				if event.From.After(operatorProgressingInterval.From) &&
-					event.To.Before(operatorProgressingInterval.To) {
-					progressingOperatorName = operatorProgressingInterval.Locator.Keys[monitorapi.LocatorClusterOperatorKey]
-					break
-				}
-			}
+			// Pod was never deleted
 			if len(progressingOperatorName) > 0 {
 				flakes = append(flakes, fmt.Sprintf(
 					"%v - never deleted - operator:%s was progressing which may cause pod sandbox creation errors - %v",
 					event.Locator.OldLocator(), progressingOperatorName, event.Message.OldMessage()))
 			} else {
 				failures = append(failures, fmt.Sprintf(
-					"%v - never deleted - operator:%s was progressing which may cause pod sandbox creation errors - %v",
-					event.Locator.OldLocator(), progressingOperatorName, event.Message.OldMessage()))
+					"%v - never deleted - %v",
+					event.Locator.OldLocator(), event.Message.OldMessage()))
 			}
 		} else {
+			// Pod was deleted - check timing and operator status
 			timeBetweenDeleteAndFailure := event.From.Sub(*deletionTime)
-			switch {
-			case timeBetweenDeleteAndFailure < 1*time.Second:
-				// nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
-			case timeBetweenDeleteAndFailure < 5*time.Second:
-				// withing five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
-				flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
-			case deletionTime.Before(event.From):
-				// something went wrong. More than five seconds after the pod ws deleted, the CNI is trying to set up pod sandboxes and can't
-				failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
-			default:
-				// something went wrong. deletion happend after we had a failure to create the pod sandbox
-				failures = append(failures, fmt.Sprintf("%v - deletion came AFTER sandbox failure - %v", event.Locator.OldLocator(), event.Message.OldMessage()))
+
+			if len(progressingOperatorName) > 0 {
+				// If an operator was progressing, treat as flake regardless of timing
+				flakes = append(flakes, fmt.Sprintf(
+					"%v - %0.2f seconds after deletion - operator:%s was progressing which may cause pod sandbox creation errors - %v",
+					event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), progressingOperatorName, event.Message.OldMessage()))
+			} else {
+				// No operator progressing, apply timing-based logic
+				switch {
+				case timeBetweenDeleteAndFailure < 1*time.Second:
+					// nothing here, one second is close enough to be ok, the kubelet and CNI just didn't know
+				case timeBetweenDeleteAndFailure < 5*time.Second:
+					// within five seconds, it ought to be long enough to know, but it's close enough to flake and not fail
+					flakes = append(flakes, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
+				case deletionTime.Before(event.From):
+					// something went wrong. More than five seconds after the pod was deleted, the CNI is trying to set up pod sandboxes and can't
+					failures = append(failures, fmt.Sprintf("%v - %0.2f seconds after deletion - %v", event.Locator.OldLocator(), timeBetweenDeleteAndFailure.Seconds(), event.Message.OldMessage()))
+				default:
+					// something went wrong. deletion happened after we had a failure to create the pod sandbox
+					failures = append(failures, fmt.Sprintf("%v - deletion came AFTER sandbox failure - %v", event.Locator.OldLocator(), event.Message.OldMessage()))
+				}
 			}
 		}
 	}
@@ -318,6 +323,18 @@ func getPodDeletionTime(events monitorapi.Intervals, podLocator monitorapi.Locat
 	return nil
 }
 
+// getProgressingOperatorName checks if an event occurred during any operator's Progressing interval
+// and returns the name of the progressing operator, or empty string if none found.
+func getProgressingOperatorName(event monitorapi.Interval, operatorsProgressing monitorapi.Intervals) string {
+	for _, operatorProgressingInterval := range operatorsProgressing {
+		if event.From.After(operatorProgressingInterval.From) &&
+			event.To.Before(operatorProgressingInterval.To) {
+			return operatorProgressingInterval.Locator.Keys[monitorapi.LocatorClusterOperatorKey]
+		}
+	}
+	return ""
+}
+
 // bug is tracked here: https://bugzilla.redhat.com/show_bug.cgi?id=2057181
 // It was closed working as designed.
 func testOvnNodeReadinessProbe(events monitorapi.Intervals, kubeClientConfig *rest.Config) []*junitapi.JUnitTestCase {
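
Note for reviewers: below is a minimal, standalone sketch of the interval-containment check that the new getProgressingOperatorName helper performs. The interval struct and progressingOperatorFor function are simplified stand-ins invented for illustration, not the real monitorapi types and not part of this patch. It also shows the implied design choice: the event must fall strictly inside an operator's Progressing window (event.From after the window start and event.To before the window end), so partial overlaps do not match.

// sandbox_flake_sketch.go - illustration only, not part of the patch.
package main

import (
	"fmt"
	"time"
)

// interval is a hypothetical, simplified stand-in for monitorapi.Interval.
type interval struct {
	Name     string
	From, To time.Time
}

// progressingOperatorFor mirrors the containment test used by the patch's
// getProgressingOperatorName helper: it returns the name of the first
// operator whose Progressing window strictly contains the event, or ""
// if none does.
func progressingOperatorFor(event interval, operators []interval) string {
	for _, op := range operators {
		if event.From.After(op.From) && event.To.Before(op.To) {
			return op.Name
		}
	}
	return ""
}

func main() {
	base := time.Now()

	// A pod sandbox creation failure that happened entirely while the
	// (hypothetical) "network" operator was reporting Progressing=True.
	event := interval{Name: "sandbox-failure", From: base.Add(2 * time.Minute), To: base.Add(3 * time.Minute)}
	operators := []interval{
		{Name: "network", From: base.Add(1 * time.Minute), To: base.Add(5 * time.Minute)},
	}

	// Prints "network": with a progressing operator found, the patch records
	// the sandbox error as a flake instead of a hard failure.
	fmt.Println(progressingOperatorFor(event, operators))
}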