
Commit c653990

feat: add etcd cold boot recovery tests from graceful node shutdown
Add three new test cases to validate etcd cluster recovery from cold boot scenarios reached through different graceful/ungraceful shutdown combinations:

- Cold boot from double GNS: both nodes gracefully shut down simultaneously, then both restart (full cluster cold boot)
- Cold boot from sequential GNS: first node gracefully shut down, then second node gracefully shut down, then both restart
- Cold boot from mixed GNS/UGNS: first node gracefully shut down, surviving node then ungracefully shut down, then both restart

Note: The inverse case (UGNS first node, then GNS second) is not tested because in TNF clusters an ungracefully shut down node is recovered quickly, leaving no window in which to gracefully shut down the second node afterward. The double UGNS scenario is already covered by existing tests.
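In sketch form, the sequential-GNS case reduces to the following flow. This is a condensed fragment of the Ginkgo test body from the diff below, not a standalone program; vmShutdownAndWait, restartVms, and validateEtcdRecoveryState are the helpers this commit introduces or reuses:

    // Shut the nodes down one after the other, waiting for each VM to reach
    // the shut-off state (graceful mode drives virsh shutdown with a 10-minute timeout).
    err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c)
    o.Expect(err).To(o.BeNil())
    err = vmShutdownAndWait(VMShutdownModeGraceful, vmSecondToShutdown, c)
    o.Expect(err).To(o.BeNil())

    // Cold boot: start both VMs again and wait for them to reach the running state.
    restartVms(dataPair, c)

    // Both etcd members must come back as started voting members (not learners).
    validateEtcdRecoveryState(etcdClientFactory,
        &firstToShutdown, true, false,
        &secondToShutdown, true, false,
        membersHealthyAfterDoubleReboot, pollInterval)

As the new vmShutdownAndWait helper shows, graceful mode dispatches to virsh shutdown with a 10-minute timeout, while ungraceful mode dispatches to virsh destroy with a 30-second timeout.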
1 parent 3b60d77 commit c653990

File tree

1 file changed (+193, −21 lines)


test/extended/two_node/tnf_recovery.go

Lines changed: 193 additions & 21 deletions
@@ -31,8 +31,9 @@ const (
     memberPromotedVotingTimeout = 15 * time.Minute
     networkDisruptionDuration = 15 * time.Second
     vmRestartTimeout = 5 * time.Minute
-    vmUngracefulShutdownTimeout = 30 * time.Second // Ungraceful VM shutdown is typically fast
-    membersHealthyAfterDoubleReboot = 15 * time.Minute // It takes into account full VM recovering up to Etcd member healthy
+    vmUngracefulShutdownTimeout = 30 * time.Second // Ungraceful shutdown is typically fast
+    vmGracefulShutdownTimeout = 10 * time.Minute // Graceful shutdown is typically slow
+    membersHealthyAfterDoubleReboot = 15 * time.Minute // Accounts for a full VM reboot plus the etcd member becoming healthy
     pollInterval = 5 * time.Second
 )

@@ -188,20 +189,12 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
         c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB)
         o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")

-        dataPair := []struct {
-            vm, node string
-        }{
+        dataPair := []vmNodePair{
             {vmA, nodeA.Name},
             {vmB, nodeB.Name},
         }

-        defer func() {
-            for _, d := range dataPair {
-                if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil {
-                    fmt.Fprintf(g.GinkgoWriter, "Warning: failed to restart VM %s during cleanup: %v\n", d.vm, err)
-                }
-            }
-        }()
+        defer restartVms(dataPair, c)

         g.By("Simulating double node failure: stopping both nodes' VMs")
         // First, stop all VMs
@@ -216,22 +209,142 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
         }

         g.By("Restarting both nodes")
-        // Start all VMs
+        restartVms(dataPair, c)
+
+        g.By("Waiting both etcd members to become healthy")
+        validateEtcdRecoveryState(etcdClientFactory,
+            &nodeA, true, false, // member on node A expected started == true, learner == false
+            &nodeB, true, false, // member on node B expected started == true, learner == false
+            membersHealthyAfterDoubleReboot, pollInterval)
+    })
+
+    g.It("should recover from double graceful node shutdown", func() {
+        // Note: Both nodes are gracefully shut down, then both restart
+        nodeA := peerNode
+        nodeB := targetNode
+        g.GinkgoT().Printf("Testing double node graceful shutdown for %s and %s\n", nodeA.Name, nodeB.Name)
+
+        c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB)
+        o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+        dataPair := []vmNodePair{
+            {vmA, nodeA.Name},
+            {vmB, nodeB.Name},
+        }
+
+        defer restartVms(dataPair, c)
+
+        g.By(fmt.Sprintf("Gracefully shutting down both nodes at the same time (timeout: %v)", vmGracefulShutdownTimeout))
         for _, d := range dataPair {
-            err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
-            o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to start VM %s (node: %s)", d.vm, d.node))
+            innerErr := services.VirshShutdownVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+            o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected to gracefully shutdown VM %s (node: %s)", d.vm, d.node))
         }
-        // Wait for all to be running
+
         for _, d := range dataPair {
-            err := services.WaitForVMState(d.vm, services.VMStateRunning, vmUngracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
-            o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start in %s timeout", d.vm, d.node, vmRestartTimeout))
+            innerErr := services.WaitForVMState(d.vm, services.VMStateShutOff, vmGracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+            o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to reach shut off state", d.vm, d.node))
         }

-        g.By("Waiting both etcd members to become healthy")
+        g.By("Restarting both nodes")
+        restartVms(dataPair, c)
+
+        g.By("Waiting both etcd members to become healthy and voting")
         validateEtcdRecoveryState(etcdClientFactory,
-            &nodeA, true, false, // member on node A expected started == true, learner == false
-            &nodeB, true, false, // member on node B expected started == true, learner == false
+            &nodeA, true, false, // nodeA expected started == true, learner == false
+            &nodeB, true, false, // nodeB expected started == true, learner == false
+            membersHealthyAfterDoubleReboot, pollInterval)
+
+        g.By("Verifying etcd operator health after recovery")
+        o.Eventually(func() error {
+            return ensureEtcdOperatorHealthy(oc)
+        }, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
+    })
+
+    g.It("should recover from sequential graceful node shutdowns", func() {
+        // Note: First node is gracefully shut down, then the second, then both restart
+        firstToShutdown := peerNode
+        secondToShutdown := targetNode
+        g.GinkgoT().Printf("Testing sequential graceful shutdowns: first %s, then %s\n",
+            firstToShutdown.Name, secondToShutdown.Name)
+
+        c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown)
+        o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+        dataPair := []vmNodePair{
+            {vmFirstToShutdown, firstToShutdown.Name},
+            {vmSecondToShutdown, secondToShutdown.Name},
+        }
+
+        defer restartVms(dataPair, c)
+
+        g.By(fmt.Sprintf("Gracefully shutting down first node: %s", firstToShutdown.Name))
+
+        err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c)
+        o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown))
+
+        g.By(fmt.Sprintf("Gracefully shutting down second node: %s", secondToShutdown.Name))
+        err = vmShutdownAndWait(VMShutdownModeGraceful, vmSecondToShutdown, c)
+        o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown))
+
+        g.By("Restarting both nodes")
+        restartVms(dataPair, c)
+
+        g.By("Waiting both etcd members to become healthy and voting")
+        validateEtcdRecoveryState(etcdClientFactory,
+            &firstToShutdown, true, false, // firstToShutdown expected started == true, learner == false
+            &secondToShutdown, true, false, // secondToShutdown expected started == true, learner == false
+            membersHealthyAfterDoubleReboot, pollInterval)
+
+        g.By("Verifying etcd operator health after recovery")
+        o.Eventually(func() error {
+            return ensureEtcdOperatorHealthy(oc)
+        }, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
+    })
+
+    g.It("should recover from graceful shutdown followed by ungraceful node failure", func() {
+        // Note: First node is gracefully shut down, then the surviving node fails ungracefully
+        firstToShutdown := targetNode
+        secondToShutdown := peerNode
+        g.GinkgoT().Printf("Randomly selected %s to shutdown gracefully and %s to survive, then fail ungracefully\n",
+            firstToShutdown.Name, secondToShutdown.Name)
+
+        c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown)
+        o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+        dataPair := []vmNodePair{
+            {vmFirstToShutdown, firstToShutdown.Name},
+            {vmSecondToShutdown, secondToShutdown.Name},
+        }
+
+        defer restartVms(dataPair, c)
+
+        g.By(fmt.Sprintf("Gracefully shutting down VM %s (node: %s)", vmFirstToShutdown, firstToShutdown.Name))
+        err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c)
+        o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown))
+
+        g.By(fmt.Sprintf("Waiting for %s to recover cluster and work standalone", secondToShutdown.Name))
+        validateEtcdRecoveryState(etcdClientFactory,
+            &secondToShutdown, true, false, // surviving node expected started == true, learner == false
+            &firstToShutdown, false, true, // gracefully shut down node expected started == false, learner == true
+            memberIsLeaderTimeout, pollInterval)
+
+        g.By(fmt.Sprintf("Ungracefully shutting down VM %s (node: %s)", vmSecondToShutdown, secondToShutdown.Name))
+        err = vmShutdownAndWait(VMShutdownModeUngraceful, vmSecondToShutdown, c)
+        o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown))
+
+        g.By("Restarting both nodes")
+        restartVms(dataPair, c)
+
+        g.By("Waiting both etcd members to become healthy and voting")
+        validateEtcdRecoveryState(etcdClientFactory,
+            &secondToShutdown, true, false, // surviving node expected started == true, learner == false
+            &firstToShutdown, true, false, // gracefully shut down node expected started == true, learner == false
             membersHealthyAfterDoubleReboot, pollInterval)
+
+        g.By("Verifying etcd operator health after recovery")
+        o.Eventually(func() error {
+            return ensureEtcdOperatorHealthy(oc)
+        }, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
     })
 })

@@ -480,3 +593,62 @@ func setupMinimalTestEnvironment(oc *util.CLI, nodeA, nodeB *corev1.Node) (c hyp

     return
 }
+
+type vmNodePair struct {
+    vm, node string
+}
+
+type VMShutdownMode int
+
+const (
+    VMShutdownModeGraceful VMShutdownMode = iota + 1
+    VMShutdownModeUngraceful
+)
+
+func (sm VMShutdownMode) String() string {
+    switch sm {
+    case VMShutdownModeGraceful:
+        return "graceful VM shutdown"
+    case VMShutdownModeUngraceful:
+        return "ungraceful VM shutdown"
+    }
+    return "unknown vm shutdown mode"
+}
+
+func vmShutdownAndWait(mode VMShutdownMode, vm string, c hypervisorExtendedConfig) error {
+    var timeout time.Duration
+    var shutdownFunc func(vmName string, sshConfig *core.SSHConfig, knownHostsPath string) error
+    switch mode {
+    case VMShutdownModeGraceful:
+        timeout = vmGracefulShutdownTimeout
+        shutdownFunc = services.VirshShutdownVM
+    case VMShutdownModeUngraceful:
+        timeout = vmUngracefulShutdownTimeout
+        shutdownFunc = services.VirshDestroyVM
+    default:
+        return fmt.Errorf("unexpected VMShutdownMode: %s", mode)
+    }
+
+    g.GinkgoT().Printf("%s: vm %s (timeout: %v)\n", mode, vm, timeout)
+    err := shutdownFunc(vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+    if err != nil {
+        return err
+    }
+
+    return services.WaitForVMState(vm, services.VMStateShutOff, timeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+}
+
+func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) {
+    // Start all VMs asynchronously
+    for _, d := range dataPair {
+        if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil {
+            fmt.Fprintf(g.GinkgoWriter, "Warning: failed to restart VM %s during cleanup: %v\n", d.vm, err)
+        }
+    }
+
+    // Wait for all VMs to be running
+    for _, d := range dataPair {
+        err := services.WaitForVMState(d.vm, services.VMStateRunning, vmRestartTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+        o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start in %s timeout", d.vm, d.node, vmRestartTimeout))
+    }
+}
