Skip to content

Commit 8e9d12f

Browse files
committed
feat(machined): add panic/force mode reboot
In certain situations, Talos's shutdown/reboot sequence hangs while waiting for services/mounts to be gracefully stopped (see: #11775). This patch adds a forceful mode to the reboot sequence (`talosctl reboot --mode force`) that bypasses graceful userspace teardown and hard reboots the machine. Signed-off-by: Laura Brehm <[email protected]>
1 parent eccb21d commit 8e9d12f

File tree

12 files changed

+121
-17
lines changed

12 files changed

+121
-17
lines changed

api/machine/machine.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ message RebootRequest {
143143
enum Mode {
144144
DEFAULT = 0;
145145
POWERCYCLE = 1;
146+
FORCE = 2;
146147
}
147148
Mode mode = 1;
148149
}

cmd/talosctl/cmd/talos/reboot.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ var rebootCmd = &cobra.Command{
3939
// skips kexec and reboots with power cycle
4040
case "powercycle":
4141
opts = append(opts, client.WithPowerCycle)
42+
case "force":
43+
opts = append(opts, client.WithForce)
4244
case "default":
4345
default:
4446
return fmt.Errorf("invalid reboot mode: %q", rebootCmdFlags.mode)

cmd/talosctl/pkg/talos/action/node.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,12 @@ func (a *nodeTracker) handleEvent(event client.Event) error {
286286
Status: reporter.StatusRunning,
287287
})
288288

289+
// TODO: if we first do stopAllServices, that's already handled
290+
// and we don't need special handling for the reboot "error" case.
291+
if msg.GetSequence() == "reboot" {
292+
return retry.ExpectedErrorf("reboot sequence completed")
293+
}
294+
289295
if errStr != "" {
290296
return fmt.Errorf("sequence error: %s", msg.GetError().GetMessage())
291297
}

internal/app/machined/pkg/runtime/sequencer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ type Sequencer interface {
147147
Boot(Runtime) []Phase
148148
Initialize(Runtime) []Phase
149149
Install(Runtime) []Phase
150-
Reboot(Runtime) []Phase
150+
Reboot(Runtime, *machine.RebootRequest) []Phase
151151
Reset(Runtime, ResetOptions) []Phase
152152
Shutdown(Runtime, *machine.ShutdownRequest) []Phase
153153
StageUpgrade(Runtime, *machine.UpgradeRequest) []Phase

internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,19 @@ func (c *Controller) phases(seq runtime.Sequence, data any) ([]runtime.Phase, er
381381

382382
phases = c.s.Shutdown(c.r, in)
383383
case runtime.SequenceReboot:
384-
phases = c.s.Reboot(c.r)
384+
// TODO: we're both using *machine.XxxRequest and
385+
// runtime.XxxOptions types here. Is one of these preferred?
386+
// Might be kind of confusing to be passing a request here
387+
// when the request also gets passed to the specific tasks
388+
// via controller.runTask().
389+
var in *machine.RebootRequest
390+
if req, ok := data.(*machine.RebootRequest); ok {
391+
// TODO: complain if missing? are there older clients
392+
// that might not send a `machine.RebootRequest`?
393+
in = req
394+
}
395+
396+
phases = c.s.Reboot(c.r, in)
385397
case runtime.SequenceUpgrade:
386398
in, ok := data.(*machine.UpgradeRequest)
387399
if !ok {

internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func (m *mockSequencer) Install(r runtime.Runtime) []runtime.Phase {
4444
return m.phases[runtime.SequenceInstall]
4545
}
4646

47-
func (m *mockSequencer) Reboot(r runtime.Runtime) []runtime.Phase {
47+
func (m *mockSequencer) Reboot(r runtime.Runtime, req *machine.RebootRequest) []runtime.Phase {
4848
return m.phases[runtime.SequenceReboot]
4949
}
5050

internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -233,18 +233,22 @@ func (*Sequencer) Boot(r runtime.Runtime) []runtime.Phase {
233233
}
234234

235235
// Reboot is the reboot sequence.
236-
func (*Sequencer) Reboot(r runtime.Runtime) []runtime.Phase {
237-
phases := PhaseList{}.Append(
238-
"cleanup",
239-
StopAllPods,
240-
).Append(
241-
"dbus",
242-
StopDBus,
243-
).
244-
AppendList(stopAllPhaselist(r, true)).
245-
Append("reboot", Reboot)
236+
func (*Sequencer) Reboot(r runtime.Runtime, in *machineapi.RebootRequest) []runtime.Phase {
237+
if in.GetMode() == machineapi.RebootRequest_FORCE {
238+
return PhaseList{}.
239+
Append("reboot", Reboot)
240+
}
246241

247-
return phases
242+
return PhaseList{}.
243+
Append(
244+
"cleanup",
245+
StopAllPods,
246+
).
247+
Append(
248+
"dbus",
249+
StopDBus,
250+
).
251+
AppendList(stopAllPhaselist(r, true))
248252
}
249253

250254
// Reset is the reset sequence.

internal/integration/api/reboot.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ import (
1010
"context"
1111
"fmt"
1212
"sync"
13+
"sync/atomic"
1314
"testing"
1415
"time"
1516

1617
"github.com/siderolabs/go-retry/retry"
1718

1819
"github.com/siderolabs/talos/internal/integration/base"
20+
machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine"
1921
"github.com/siderolabs/talos/pkg/machinery/client"
2022
"github.com/siderolabs/talos/pkg/machinery/config/machine"
2123
)
@@ -71,6 +73,71 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
7173
}
7274
}
7375

76+
// TestForcedReboot force-reboots cluster node by node,
77+
// ensuring that the 'cleanup' phase/'stopAllPods' task doesn't run.
78+
func (suite *RebootSuite) TestForcedReboot() { //nolint:gocyclo
79+
if !suite.Capabilities().SupportsReboot {
80+
suite.T().Skip("cluster doesn't support reboots")
81+
}
82+
83+
nodes := suite.DiscoverNodeInternalIPs(suite.ctx)
84+
suite.Require().NotEmpty(nodes)
85+
86+
for _, node := range nodes {
87+
suite.T().Log("force rebooting node", node)
88+
89+
nodeCtx := client.WithNodes(suite.ctx, node)
90+
91+
var (
92+
sawStopAllPods atomic.Bool
93+
sawCleanupPhase atomic.Bool
94+
)
95+
96+
// watch events so we can verify graceful teardown did not happen
97+
watchCtx, watchCancel := context.WithCancel(nodeCtx)
98+
eventsCh := make(chan client.EventResult)
99+
suite.Require().NoError(suite.Client.EventsWatchV2(watchCtx, eventsCh))
100+
101+
go func() {
102+
for {
103+
select {
104+
case <-watchCtx.Done():
105+
return
106+
case ev := <-eventsCh:
107+
if ev.Error != nil {
108+
continue
109+
}
110+
111+
switch msg := ev.Event.Payload.(type) {
112+
case *machineapi.TaskEvent:
113+
if msg.GetTask() == "stopAllPods" {
114+
sawStopAllPods.Store(true)
115+
}
116+
case *machineapi.PhaseEvent:
117+
if msg.GetPhase() == "cleanup" {
118+
sawCleanupPhase.Store(true)
119+
}
120+
}
121+
}
122+
}
123+
}()
124+
125+
suite.AssertRebooted(
126+
suite.ctx, node, func(nodeCtx context.Context) error {
127+
return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx, client.WithForce))
128+
}, 10*time.Minute,
129+
suite.CleanupFailedPods,
130+
)
131+
132+
watchCancel()
133+
134+
suite.Require().Falsef(sawCleanupPhase.Load(), "cleanup phase must not run during forced reboot")
135+
suite.Require().Falsef(sawStopAllPods.Load(), "stopAllPods task must not run during forced reboot")
136+
}
137+
138+
suite.WaitForBootDone(suite.ctx)
139+
}
140+
74141
// TestRebootMultiple reboots a node, issues subsequent reboots
75142
// reboot should cancel boot sequence, and cancel another reboot.
76143
func (suite *RebootSuite) TestRebootMultiple() {

pkg/machinery/api/machine/machine.pb.go

Lines changed: 7 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/machinery/client/client.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,11 @@ func WithPowerCycle(req *machineapi.RebootRequest) {
337337
req.Mode = machineapi.RebootRequest_POWERCYCLE
338338
}
339339

340+
// WithForce option runs the Reboot function in force mode.
341+
func WithForce(req *machineapi.RebootRequest) {
342+
req.Mode = machineapi.RebootRequest_FORCE
343+
}
344+
340345
// Reboot implements the proto.MachineServiceClient interface.
341346
func (c *Client) Reboot(ctx context.Context, opts ...RebootMode) error {
342347
_, err := c.RebootWithResponse(ctx, opts...)

0 commit comments

Comments
 (0)