@@ -17,6 +17,7 @@ import (
 	"github.com/dustin/go-humanize"
 	"github.com/stretchr/testify/require"
 	"go.yaml.in/yaml/v4"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	"github.com/siderolabs/talos/internal/integration/base"
 	"github.com/siderolabs/talos/pkg/machinery/client"
@@ -39,8 +40,6 @@ func (suite *OomSuite) SuiteName() string {
 
 // TestOom verifies that system remains stable after handling an OOM event.
 func (suite *OomSuite) TestOom() {
-	suite.T().Skip("skip the test until https://github.com/siderolabs/talos/issues/12077 is resolved")
-
 	if suite.Cluster == nil {
 		suite.T().Skip("without full cluster state reaching out to the node IP is not reliable")
 	}
@@ -49,6 +48,10 @@ func (suite *OomSuite) TestOom() {
 		suite.T().Skip("skipping in short mode")
 	}
 
+	if suite.Race {
+		suite.T().Skip("skipping as OOM tests are incompatible with race detector")
+	}
+
 	if suite.Cluster.Provisioner() != base.ProvisionerQEMU {
 		suite.T().Skip("skipping OOM test since provisioner is not qemu")
 	}
@@ -59,10 +62,33 @@ func (suite *OomSuite) TestOom() {
 	oomPodManifest := suite.ParseManifests(oomPodSpec)
 
 	suite.T().Cleanup(func() {
-		cleanUpCtx, cleanupCancel := context.WithTimeout(context.Background(), time.Minute)
+		cleanUpCtx, cleanupCancel := context.WithTimeout(context.Background(), 2*time.Minute)
 		defer cleanupCancel()
 
 		suite.DeleteManifests(cleanUpCtx, oomPodManifest)
+
+		ticker := time.NewTicker(time.Second)
+		done := cleanUpCtx.Done()
+
+		// Wait for all stress-mem pods to complete terminating
+		for {
+			select {
+			case <-ticker.C:
+				pods, err := suite.Clientset.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
+					LabelSelector: "app=stress-mem",
+				})
+
+				suite.Require().NoError(err)
+
+				if len(pods.Items) == 0 {
+					return
+				}
+			case <-done:
+				suite.Require().Fail("Timed out waiting for cleanup")
+
+				return
+			}
+		}
 	})
 
 	suite.ApplyManifests(ctx, oomPodManifest)
@@ -77,7 +103,7 @@ func (suite *OomSuite) TestOom() {
 	suite.Require().NoError(err)
 
 	memoryBytes := memInfo.GetMessages()[0].GetMeminfo().GetMemtotal() * 1024
-	numReplicas := int((memoryBytes/1024/1024+2048-1)/2048) * numWorkers * 15
+	numReplicas := int((memoryBytes/1024/1024+2048-1)/2048) * numWorkers * 25
 
 	suite.T().Logf("detected memory: %s, workers %d => scaling to %d replicas",
 		humanize.IBytes(memoryBytes), numWorkers, numReplicas)
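For a concrete sense of the sizing above: the expression is a ceiling division of total memory into 2 GiB chunks, multiplied by the worker count and a per-chunk factor (raised here from 15 to 25). A minimal sketch with assumed values (8 GiB MemTotal, 2 workers), not taken from a real run:

package main

import "fmt"

func main() {
	memoryBytes := uint64(8 * 1024 * 1024 * 1024) // assumed: node reports 8 GiB of total memory
	numWorkers := 2                               // assumed worker count

	// Ceiling division of MiB into 2048 MiB chunks, then 25 replicas per chunk per worker.
	numReplicas := int((memoryBytes/1024/1024+2048-1)/2048) * numWorkers * 25

	fmt.Println(numReplicas) // 8192 MiB => 4 chunks => 4 * 2 * 25 = 200
}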
@@ -86,12 +112,16 @@ func (suite *OomSuite) TestOom() {
 	suite.PatchK8sObject(ctx, "default", "apps", "Deployment", "v1", "stress-mem", patchToReplicas(suite.T(), numReplicas))
 
 	// Expect at least one OOM kill of stress-ng within 15 seconds
-	suite.Assert().True(suite.waitForOOMKilled(ctx, 15*time.Second, 2*time.Minute, "stress-ng"))
+	suite.Assert().True(suite.waitForOOMKilled(ctx, 15*time.Second, 2*time.Minute, "stress-ng", 1))
 
 	// Scale to 1, wait for deployment to scale down, proving system is operational
 	suite.PatchK8sObject(ctx, "default", "apps", "Deployment", "v1", "stress-mem", patchToReplicas(suite.T(), 1))
 	suite.Require().NoError(suite.WaitForDeploymentAvailable(ctx, time.Minute, "default", "stress-mem", 1))
 
+	// Monitor OOM kills for 15 seconds and make sure no kills other than stress-ng happen
+	// Allow 0 as well: ideally that'd be the case, but fail on anything not containing stress-ng
+	suite.Assert().True(suite.waitForOOMKilled(ctx, 15*time.Second, 2*time.Minute, "stress-ng", 0))
+
 	suite.APISuite.AssertClusterHealthy(ctx)
 }
 
@@ -111,7 +141,7 @@ func patchToReplicas(t *testing.T, replicas int) []byte {
 // Waits for a period of time and returns whether or not OOM events containing a specified process have been observed.
 //
 //nolint:gocyclo
-func (suite *OomSuite) waitForOOMKilled(ctx context.Context, timeToObserve, timeout time.Duration, substr string) bool {
+func (suite *OomSuite) waitForOOMKilled(ctx context.Context, timeToObserve, timeout time.Duration, substr string, n int) bool {
 	startTime := time.Now()
 
 	watchCh := make(chan state.Event)
@@ -135,9 +165,9 @@ func (suite *OomSuite) waitForOOMKilled(ctx context.Context, timeToObserve, time
 		case <-timeoutCh:
 			suite.T().Logf("observed %d OOM events containing process substring %q", numOOMObserved, substr)
 
-			return numOOMObserved > 0
+			return numOOMObserved >= n
 		case <-timeToObserveCh:
-			if numOOMObserved > 0 {
+			if numOOMObserved >= n {
 				// if we already observed some OOM events, consider it a success
 				suite.T().Logf("observed %d OOM events containing process substring %q", numOOMObserved, substr)
 
@@ -150,10 +180,27 @@ func (suite *OomSuite) waitForOOMKilled(ctx context.Context, timeToObserve, time
 
 			res := ev.Resource.(*runtime.OOMAction).TypedSpec()
 
+			bailOut := false
+
 			for _, proc := range res.Processes {
 				if strings.Contains(proc, substr) {
 					numOOMObserved++
+
+					break
 				}
+
+				// Sometimes OOM catches containers in restart phase (while the
+				// cgroup has previously accumulated OOM score).
+				// Consider an OOM event wrong if something other than that is found.
+				if !strings.Contains(proc, "runc init") && !strings.Contains(proc, "/pause") && proc != "" {
+					bailOut = true
+				}
+			}
+
+			if bailOut {
+				suite.T().Logf("observed an OOM event not containing process substring %q: %q (%d containing)", substr, res.Processes, numOOMObserved)
+
+				return false
+			}
 			}
 		}
 	}
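Read together, the two call sites and the reworked event loop behave roughly as follows: the first call (n = 1) requires at least one stress-ng OOM kill, while the second call (n = 0) has a trivially satisfied threshold and can only fail when an event names an unexpected process. A simplified sketch of that per-event decision, using a hypothetical classify helper rather than the suite code itself ("runc init", "/pause" and empty entries are treated as tolerated noise):

package main

import (
	"fmt"
	"strings"
)

// classify is a hypothetical helper mirroring the spirit of the loop above:
// it reports how many processes in one OOM event match substr, and whether
// the event names anything other than the tolerated noise.
func classify(processes []string, substr string) (matches int, unexpected bool) {
	for _, proc := range processes {
		switch {
		case strings.Contains(proc, substr):
			matches++
		case strings.Contains(proc, "runc init"), strings.Contains(proc, "/pause"), proc == "":
			// tolerated: containers caught restarting, pause containers, empty entries
		default:
			unexpected = true
		}
	}

	return matches, unexpected
}

func main() {
	m, bad := classify([]string{"stress-ng --vm 1", "/pause"}, "stress-ng")
	fmt.Println(m >= 1, bad) // true false: acceptable for the first call (n == 1)

	m, bad = classify([]string{"coredns"}, "stress-ng")
	fmt.Println(m >= 0, bad) // true true: the second call (n == 0) fails via the unexpected-process path
}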