@@ -10,12 +10,14 @@ import (
1010 "context"
1111 "fmt"
1212 "sync"
13+ "sync/atomic"
1314 "testing"
1415 "time"
1516
1617 "github.com/siderolabs/go-retry/retry"
1718
1819 "github.com/siderolabs/talos/internal/integration/base"
20+ machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine"
1921 "github.com/siderolabs/talos/pkg/machinery/client"
2022 "github.com/siderolabs/talos/pkg/machinery/config/machine"
2123)
@@ -71,6 +73,71 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
7173 }
7274}
7375
76+ // TestForcedReboot force-reboots cluster node by node,
77+ // ensuring that the 'cleanup' phase/'stopAllPods' task doesn't run.
78+ func (suite * RebootSuite ) TestForcedReboot () { //nolint:gocyclo
79+ if ! suite .Capabilities ().SupportsReboot {
80+ suite .T ().Skip ("cluster doesn't support reboots" )
81+ }
82+
83+ nodes := suite .DiscoverNodeInternalIPs (suite .ctx )
84+ suite .Require ().NotEmpty (nodes )
85+
86+ for _ , node := range nodes {
87+ suite .T ().Log ("force rebooting node" , node )
88+
89+ nodeCtx := client .WithNodes (suite .ctx , node )
90+
91+ var (
92+ sawStopAllPods atomic.Bool
93+ sawCleanupPhase atomic.Bool
94+ )
95+
96+ // watch events so we can verify graceful teardown did not happen
97+ watchCtx , watchCancel := context .WithCancel (nodeCtx )
98+ eventsCh := make (chan client.EventResult )
99+ suite .Require ().NoError (suite .Client .EventsWatchV2 (watchCtx , eventsCh ))
100+
101+ go func () {
102+ for {
103+ select {
104+ case <- watchCtx .Done ():
105+ return
106+ case ev := <- eventsCh :
107+ if ev .Error != nil {
108+ continue
109+ }
110+
111+ switch msg := ev .Event .Payload .(type ) {
112+ case * machineapi.TaskEvent :
113+ if msg .GetTask () == "stopAllPods" {
114+ sawStopAllPods .Store (true )
115+ }
116+ case * machineapi.PhaseEvent :
117+ if msg .GetPhase () == "cleanup" {
118+ sawCleanupPhase .Store (true )
119+ }
120+ }
121+ }
122+ }
123+ }()
124+
125+ suite .AssertRebooted (
126+ suite .ctx , node , func (nodeCtx context.Context ) error {
127+ return base .IgnoreGRPCUnavailable (suite .Client .Reboot (nodeCtx , client .WithForce ))
128+ }, 10 * time .Minute ,
129+ suite .CleanupFailedPods ,
130+ )
131+
132+ watchCancel ()
133+
134+ suite .Require ().Falsef (sawCleanupPhase .Load (), "cleanup phase must not run during forced reboot" )
135+ suite .Require ().Falsef (sawStopAllPods .Load (), "stopAllPods task must not run during forced reboot" )
136+ }
137+
138+ suite .WaitForBootDone (suite .ctx )
139+ }
140+
74141// TestRebootMultiple reboots a node, then issues subsequent reboots:
75142// a reboot should cancel the boot sequence, and cancel another reboot.
76143func (suite * RebootSuite ) TestRebootMultiple () {
0 commit comments