Skip to content

Commit bdd5a66

Browse files
committed
fix: improve OOM controller stability and make test strict on false positives
- Add d_* PSI derivative values to the trigger expression context - Only trigger OOM action while PSI is rising - Make OOM test fail if controller kills a cgroup without stress-ng - Wait for stress-mem to terminate before proceeding with the next tests - Skip OOM test when running with race detector Signed-off-by: Dmitrii Sharshakov <[email protected]>
1 parent 021bbfe commit bdd5a66

File tree

13 files changed

+149
-42
lines changed

13 files changed

+149
-42
lines changed

.github/workflows/ci.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
22
#
3-
# Generated on 2025-10-23T15:10:44Z by kres 46e133d.
3+
# Generated on 2025-10-31T17:08:03Z by kres cd5a938.
44

55
concurrency:
66
group: ${{ github.head_ref || github.run_id }}
@@ -4670,6 +4670,7 @@ jobs:
46704670
make initramfs installer-base imager installer
46714671
- name: e2e-qemu-race
46724672
env:
4673+
EXTRA_TEST_ARGS: -talos.race
46734674
GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-race
46744675
IMAGE_REGISTRY: registry.dev.siderolabs.io
46754676
QEMU_EXTRA_DISKS: "3"

.github/workflows/integration-qemu-race-cron.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
22
#
3-
# Generated on 2025-09-19T11:03:20Z by kres 065ec4c.
3+
# Generated on 2025-10-31T17:08:03Z by kres cd5a938.
44

55
concurrency:
66
group: ${{ github.head_ref || github.run_id }}
@@ -88,6 +88,7 @@ jobs:
8888
make initramfs installer-base imager installer
8989
- name: e2e-qemu-race
9090
env:
91+
EXTRA_TEST_ARGS: -talos.race
9192
GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-race
9293
IMAGE_REGISTRY: registry.dev.siderolabs.io
9394
QEMU_EXTRA_DISKS: "3"

.kres.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2059,6 +2059,7 @@ spec:
20592059
WITH_CONFIG_PATCH_WORKER: "@hack/test/patches/ephemeral-nvme.yaml:@hack/test/patches/dm-raid-module.yaml"
20602060
QEMU_MEMORY_CONTROLPLANES: 4096 # race-enabled Talos consumes lots of RAM
20612061
QEMU_MEMORY_WORKERS: 4096
2062+
EXTRA_TEST_ARGS: "-talos.race"
20622063
TAG_SUFFIX: -race
20632064
IMAGE_REGISTRY: registry.dev.siderolabs.io
20642065
- name: save-talos-logs

internal/app/machined/pkg/controllers/runtime/internal/oom/oom.go

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"io/fs"
1111
"os"
1212
"path/filepath"
13+
"time"
1314

1415
"github.com/google/cel-go/common/types"
1516
"go.uber.org/zap"
@@ -56,12 +57,7 @@ func (cgroup *RankedCgroup) CalculateScore(expr *cel.Expression) (float64, error
5657

5758
// EvaluateTrigger is a method obtaining data and evaluating the trigger expression.
5859
// When the result is true, designated OOM action is to be executed.
59-
func EvaluateTrigger(triggerExpr cel.Expression, evalContext map[string]any, cgroup string) (bool, error) {
60-
err := PopulatePsiToCtx(cgroup, evalContext)
61-
if err != nil {
62-
return false, fmt.Errorf("cannot populate PSI context: %w", err)
63-
}
64-
60+
func EvaluateTrigger(triggerExpr cel.Expression, evalContext map[string]any) (bool, error) {
6561
trigger, err := triggerExpr.EvalBool(celenv.OOMTrigger(), evalContext)
6662
if err != nil {
6763
return false, fmt.Errorf("cannot evaluate expression: %w", err)
@@ -71,7 +67,7 @@ func EvaluateTrigger(triggerExpr cel.Expression, evalContext map[string]any, cgr
7167
}
7268

7369
// PopulatePsiToCtx populates the context with PSI data from a cgroup.
74-
func PopulatePsiToCtx(cgroup string, evalContext map[string]any) error {
70+
func PopulatePsiToCtx(cgroup string, evalContext map[string]any, psi map[string]float64, sampleInterval time.Duration) error {
7571
node, err := cgroups.GetCgroupProperty(cgroup, "memory.pressure")
7672
if err != nil {
7773
return fmt.Errorf("cannot read memory pressure: %w", err)
@@ -93,7 +89,15 @@ func PopulatePsiToCtx(cgroup string, evalContext map[string]any) error {
9389
return fmt.Errorf("PSI is not defined")
9490
}
9591

92+
diff := 0.
93+
94+
if oldValue, ok := psi["memory_"+psiType+"_"+span]; ok {
95+
diff = (value.Float64() - oldValue) / sampleInterval.Seconds()
96+
}
97+
98+
evalContext["d_memory_"+psiType+"_"+span] = diff
9699
evalContext["memory_"+psiType+"_"+span] = value.Float64()
100+
psi["memory_"+psiType+"_"+span] = value.Float64()
97101
}
98102
}
99103

internal/app/machined/pkg/controllers/runtime/internal/oom/oom_test.go

Lines changed: 52 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -125,29 +125,45 @@ func TestPopulatePsiToCtx(t *testing.T) {
125125
dir: "./testdata/trigger-false",
126126
expectErr: "",
127127
expect: map[string]any{
128-
"memory_full_avg10": 2.4,
129-
"memory_full_avg300": 1.71,
130-
"memory_full_avg60": 5.16,
131-
"memory_full_total": 1.0654831e+07,
132-
"memory_some_avg10": 2.82,
133-
"memory_some_avg300": 1.97,
134-
"memory_some_avg60": 5.95,
135-
"memory_some_total": 1.217234e+07,
128+
"memory_full_avg10": 2.4,
129+
"memory_full_avg300": 1.71,
130+
"memory_full_avg60": 5.16,
131+
"memory_full_total": 1.0654831e+07,
132+
"memory_some_avg10": 2.82,
133+
"memory_some_avg300": 1.97,
134+
"memory_some_avg60": 5.95,
135+
"memory_some_total": 1.217234e+07,
136+
"d_memory_full_avg10": 0.0,
137+
"d_memory_full_avg300": 0.0,
138+
"d_memory_full_avg60": 0.0,
139+
"d_memory_full_total": 0.0,
140+
"d_memory_some_avg10": 0.0,
141+
"d_memory_some_avg300": 0.0,
142+
"d_memory_some_avg60": 0.0,
143+
"d_memory_some_total": 0.0,
136144
},
137145
},
138146
{
139147
name: "true",
140148
dir: "./testdata/trigger-true",
141149
expectErr: "",
142150
expect: map[string]any{
143-
"memory_full_avg10": 14.54,
144-
"memory_full_avg60": 6.97,
145-
"memory_full_avg300": 1.82,
146-
"memory_full_total": 1.0654831e+07,
147-
"memory_some_avg10": 17.06,
148-
"memory_some_avg60": 8.04,
149-
"memory_some_avg300": 2.1,
150-
"memory_some_total": 1.217234e+07,
151+
"memory_full_avg10": 14.54,
152+
"memory_full_avg60": 6.97,
153+
"memory_full_avg300": 1.82,
154+
"memory_full_total": 1.0654831e+07,
155+
"memory_some_avg10": 17.06,
156+
"memory_some_avg60": 8.04,
157+
"memory_some_avg300": 2.1,
158+
"memory_some_total": 1.217234e+07,
159+
"d_memory_full_avg10": 0.0,
160+
"d_memory_full_avg300": 0.0,
161+
"d_memory_full_avg60": 0.0,
162+
"d_memory_full_total": 0.0,
163+
"d_memory_some_avg10": 0.0,
164+
"d_memory_some_avg300": 0.0,
165+
"d_memory_some_avg60": 0.0,
166+
"d_memory_some_total": 0.0,
151167
},
152168
},
153169
} {
@@ -156,7 +172,7 @@ func TestPopulatePsiToCtx(t *testing.T) {
156172

157173
ctx := map[string]any{}
158174

159-
err := oom.PopulatePsiToCtx(test.dir, ctx)
175+
err := oom.PopulatePsiToCtx(test.dir, ctx, make(map[string]float64), 0)
160176

161177
if test.expectErr == "" {
162178
require.NoError(t, err)
@@ -176,6 +192,17 @@ func TestEvaluateTrigger(t *testing.T) {
176192
celenv.OOMTrigger(),
177193
))
178194

195+
zeroPsi := map[string]float64{
196+
"memory_full_avg10": 0,
197+
"memory_full_avg300": 0,
198+
"memory_full_avg60": 0,
199+
"memory_full_total": 0,
200+
"memory_some_avg10": 0,
201+
"memory_some_avg300": 0,
202+
"memory_some_avg60": 0,
203+
"memory_some_total": 0,
204+
}
205+
179206
for _, test := range []struct {
180207
name string
181208
dir string
@@ -192,7 +219,7 @@ func TestEvaluateTrigger(t *testing.T) {
192219
},
193220
triggerExpr: triggerExpr1,
194221
expect: false,
195-
expectErr: "cannot populate PSI context: cannot read memory pressure: error opening cgroupfs file open testdata/empty/memory.pressure: no such file or directory",
222+
expectErr: "cannot read memory pressure: error opening cgroupfs file open testdata/empty/memory.pressure: no such file or directory",
196223
},
197224
{
198225
name: "cgroup-false",
@@ -241,12 +268,16 @@ func TestEvaluateTrigger(t *testing.T) {
241268
t.Run(test.name, func(t *testing.T) {
242269
t.Parallel()
243270

244-
trigger, err := oom.EvaluateTrigger(test.triggerExpr, test.ctx, test.dir)
245-
246-
assert.Equal(t, test.expect, trigger)
271+
err := oom.PopulatePsiToCtx(test.dir, test.ctx, zeroPsi, 0)
247272

248273
if test.expectErr == "" {
249274
require.NoError(t, err)
275+
276+
trigger, err := oom.EvaluateTrigger(test.triggerExpr, test.ctx)
277+
278+
assert.Equal(t, test.expect, trigger)
279+
280+
require.NoError(t, err)
250281
} else {
251282
assert.ErrorContains(t, err, test.expectErr)
252283
}

internal/app/machined/pkg/controllers/runtime/oom.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ type OOMController struct {
4646
V1Alpha1Mode runtime.Mode
4747
actionLog []actionLogItem
4848
idSeq int
49+
psi map[string]float64
4950
}
5051

5152
// Name implements controller.Controller interface.
@@ -117,6 +118,7 @@ func (ctrl *OOMController) Run(ctx context.Context, r controller.Runtime, logger
117118
triggerExpr := defaultTriggerExpr()
118119
scoringExpr := defaultScoringExpr()
119120
sampleInterval := defaultSampleInterval
121+
ctrl.psi = make(map[string]float64)
120122

121123
ticker := time.NewTicker(sampleInterval)
122124
tickerC := ticker.C
@@ -150,7 +152,14 @@ func (ctrl *OOMController) Run(ctx context.Context, r controller.Runtime, logger
150152
"time_since_trigger": time.Since(ctrl.ActionTriggered),
151153
}
152154

153-
trigger, err := oom.EvaluateTrigger(triggerExpr, evalContext, ctrl.CgroupRoot)
155+
err := oom.PopulatePsiToCtx(ctrl.CgroupRoot, evalContext, ctrl.psi, sampleInterval)
156+
if err != nil {
157+
logger.Error("cannot populate PSI context", zap.Error(err))
158+
159+
continue
160+
}
161+
162+
trigger, err := oom.EvaluateTrigger(triggerExpr, evalContext)
154163
if err != nil {
155164
logger.Error("cannot evaluate OOM trigger expression", zap.Error(err))
156165

internal/integration/base/base.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ type TalosSuite struct {
6262
CSITestTimeout string
6363
// Airgapped marks that cluster has no access to external networks
6464
Airgapped bool
65+
// Race informs test suites about race detector being enabled (e.g. for skipping incompatible tests)
66+
Race bool
6567

6668
discoveredNodes cluster.Info
6769
}

internal/integration/integration_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ var (
4141
extensionsNvidia bool
4242
verifyUKIBooted bool
4343
airgapped bool
44+
race bool
4445

4546
talosConfig string
4647
endpoint string
@@ -118,6 +119,7 @@ func TestIntegration(t *testing.T) {
118119
CSITestName: csiTestName,
119120
CSITestTimeout: csiTestTimeout,
120121
Airgapped: airgapped,
122+
Race: race,
121123
})
122124
}
123125

@@ -151,6 +153,7 @@ func init() {
151153
flag.BoolVar(&selinuxEnforcing, "talos.enforcing", false, "enable tests for SELinux enforcing mode")
152154
flag.BoolVar(&extensionsQEMU, "talos.extensions.qemu", false, "enable tests for qemu extensions")
153155
flag.BoolVar(&extensionsNvidia, "talos.extensions.nvidia", false, "enable tests for nvidia extensions")
156+
flag.BoolVar(&race, "talos.race", false, "skip tests that are incompatible with race detector")
154157
flag.BoolVar(&verifyUKIBooted, "talos.verifyukibooted", true, "enable tests for verifying that Talos was booted using a UKI")
155158

156159
flag.StringVar(

0 commit comments

Comments
 (0)