@@ -31,8 +31,9 @@ const (
 	memberPromotedVotingTimeout     = 15 * time.Minute
 	networkDisruptionDuration       = 15 * time.Second
 	vmRestartTimeout                = 5 * time.Minute
-	vmUngracefulShutdownTimeout     = 30 * time.Second // Ungraceful VM shutdown is typically fast
-	membersHealthyAfterDoubleReboot = 15 * time.Minute // It takes into account full VM recovering up to Etcd member healthy
+	vmUngracefulShutdownTimeout     = 30 * time.Second // Ungraceful shutdown is typically fast
+	vmGracefulShutdownTimeout       = 10 * time.Minute // Graceful shutdown is typically slow
+	membersHealthyAfterDoubleReboot = 15 * time.Minute // Accounts for a full VM reboot plus the etcd member becoming healthy
 	pollInterval                    = 5 * time.Second
 )
 
@@ -188,20 +189,12 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB)
 		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
 
-		dataPair := []struct {
-			vm, node string
-		}{
+		dataPair := []vmNodePair{
 			{vmA, nodeA.Name},
 			{vmB, nodeB.Name},
 		}
 
-		defer func() {
-			for _, d := range dataPair {
-				if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil {
-					fmt.Fprintf(g.GinkgoWriter, "Warning: failed to restart VM %s during cleanup: %v\n", d.vm, err)
-				}
-			}
-		}()
+		defer restartVms(dataPair, c)
 
 		g.By("Simulating double node failure: stopping both nodes' VMs")
 		// First, stop all VMs
@@ -216,22 +209,142 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		}
 
 		g.By("Restarting both nodes")
-		// Start all VMs
+		restartVms(dataPair, c)
+
+		g.By("Waiting for both etcd members to become healthy")
+		validateEtcdRecoveryState(etcdClientFactory,
+			&nodeA, true, false, // member on node A expected started == true, learner == false
+			&nodeB, true, false, // member on node B expected started == true, learner == false
+			membersHealthyAfterDoubleReboot, pollInterval)
+	})
+
+	g.It("should recover from double graceful node shutdown", func() {
+		// Note: Both nodes are gracefully shut down, then both restart
+		nodeA := peerNode
+		nodeB := targetNode
+		g.GinkgoT().Printf("Testing double node graceful shutdown for %s and %s\n", nodeA.Name, nodeB.Name)
+
+		c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB)
+		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+		dataPair := []vmNodePair{
+			{vmA, nodeA.Name},
+			{vmB, nodeB.Name},
+		}
+
+		defer restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Gracefully shutting down both nodes at the same time (timeout: %v)", vmGracefulShutdownTimeout))
 		for _, d := range dataPair {
-			err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
-			o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to start VM %s (node: %s)", d.vm, d.node))
+			innerErr := services.VirshShutdownVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+			o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected to gracefully shut down VM %s (node: %s)", d.vm, d.node))
 		}
-		// Wait for all to be running
+
 		for _, d := range dataPair {
-			err := services.WaitForVMState(d.vm, services.VMStateRunning, vmUngracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
-			o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start in %s timeout", d.vm, d.node, vmRestartTimeout))
+			innerErr := services.WaitForVMState(d.vm, services.VMStateShutOff, vmGracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+			o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to reach shut off state", d.vm, d.node))
 		}
 
-		g.By("Waiting both etcd members to become healthy")
+		g.By("Restarting both nodes")
+		restartVms(dataPair, c)
+
+		g.By("Waiting for both etcd members to become healthy and voting")
 		validateEtcdRecoveryState(etcdClientFactory,
-			&nodeA, true, false, // member on node A expected started == true, learner == false
-			&nodeB, true, false, // member on node B expected started == true, learner == false
+			&nodeA, true, false, // nodeA expected started == true, learner == false
+			&nodeB, true, false, // nodeB expected started == true, learner == false
+			membersHealthyAfterDoubleReboot, pollInterval)
+
+		g.By("Verifying etcd operator health after recovery")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
+	})
+
+	g.It("should recover from sequential graceful node shutdowns", func() {
+		// Note: First node is gracefully shut down, then the second, then both restart
+		firstToShutdown := peerNode
+		secondToShutdown := targetNode
+		g.GinkgoT().Printf("Testing sequential graceful shutdowns: first %s, then %s\n",
+			firstToShutdown.Name, secondToShutdown.Name)
+
+		c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown)
+		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+		dataPair := []vmNodePair{
+			{vmFirstToShutdown, firstToShutdown.Name},
+			{vmSecondToShutdown, secondToShutdown.Name},
+		}
+
+		defer restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Gracefully shutting down first node: %s", firstToShutdown.Name))
+
+		err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown))
+
+		g.By(fmt.Sprintf("Gracefully shutting down second node: %s", secondToShutdown.Name))
+		err = vmShutdownAndWait(VMShutdownModeGraceful, vmSecondToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown))
+
+		g.By("Restarting both nodes")
+		restartVms(dataPair, c)
+
+		g.By("Waiting for both etcd members to become healthy and voting")
+		validateEtcdRecoveryState(etcdClientFactory,
+			&firstToShutdown, true, false, // firstToShutdown expected started == true, learner == false
+			&secondToShutdown, true, false, // secondToShutdown expected started == true, learner == false
+			membersHealthyAfterDoubleReboot, pollInterval)
+
+		g.By("Verifying etcd operator health after recovery")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
+	})
+
+	g.It("should recover from graceful shutdown followed by ungraceful node failure", func() {
+		// Note: The first node is gracefully shut down, then the surviving node fails ungracefully
+		firstToShutdown := targetNode
+		secondToShutdown := peerNode
+		g.GinkgoT().Printf("Selected %s to shut down gracefully and %s to survive, then fail ungracefully\n",
+			firstToShutdown.Name, secondToShutdown.Name)
+
+		c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown)
+		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+		dataPair := []vmNodePair{
+			{vmFirstToShutdown, firstToShutdown.Name},
+			{vmSecondToShutdown, secondToShutdown.Name},
+		}
+
+		defer restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Gracefully shutting down VM %s (node: %s)", vmFirstToShutdown, firstToShutdown.Name))
+		err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown))
+
+		g.By(fmt.Sprintf("Waiting for %s to recover the cluster and work standalone", secondToShutdown.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&secondToShutdown, true, false, // the surviving node expected started == true, learner == false
+			&firstToShutdown, false, true, // the stopped node expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ungracefully shutting down VM %s (node: %s)", vmSecondToShutdown, secondToShutdown.Name))
+		err = vmShutdownAndWait(VMShutdownModeUngraceful, vmSecondToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown))
+
+		g.By("Restarting both nodes")
+		restartVms(dataPair, c)
+
+		g.By("Waiting for both etcd members to become healthy and voting")
+		validateEtcdRecoveryState(etcdClientFactory,
+			&secondToShutdown, true, false, // the surviving node expected started == true, learner == false
+			&firstToShutdown, true, false, // the stopped node expected started == true, learner == false
 			membersHealthyAfterDoubleReboot, pollInterval)
+
+		g.By("Verifying etcd operator health after recovery")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
 	})
 })
 
@@ -480,3 +593,62 @@ func setupMinimalTestEnvironment(oc *util.CLI, nodeA, nodeB *corev1.Node) (c hyp
 
 	return
 }
+
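+// vmNodePair associates a libvirt VM name with the name of the cluster node it backs.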
+type vmNodePair struct {
+	vm, node string
+}
+
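+// VMShutdownMode selects how a VM is brought down: gracefully (guest OS shutdown) or
+// ungracefully (forced power-off).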
+type VMShutdownMode int
+
+const (
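+	// Starting at iota + 1 keeps the zero value out of the valid range, so an
+	// uninitialized VMShutdownMode stringifies as "unknown VM shutdown mode".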
+	VMShutdownModeGraceful VMShutdownMode = iota + 1
+	VMShutdownModeUngraceful
+)
+
+func (sm VMShutdownMode) String() string {
+	switch sm {
+	case VMShutdownModeGraceful:
+		return "graceful VM shutdown"
+	case VMShutdownModeUngraceful:
+		return "ungraceful VM shutdown"
+	}
+	return "unknown VM shutdown mode"
+}
+
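+// vmShutdownAndWait stops the given VM using the requested mode (VirshShutdownVM for
+// graceful, VirshDestroyVM for ungraceful) and blocks until the VM reaches the shut off
+// state or the mode-specific timeout expires.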
+func vmShutdownAndWait(mode VMShutdownMode, vm string, c hypervisorExtendedConfig) error {
+	var timeout time.Duration
+	var shutdownFunc func(vmName string, sshConfig *core.SSHConfig, knownHostsPath string) error
+	switch mode {
+	case VMShutdownModeGraceful:
+		timeout = vmGracefulShutdownTimeout
+		shutdownFunc = services.VirshShutdownVM
+	case VMShutdownModeUngraceful:
+		timeout = vmUngracefulShutdownTimeout
+		shutdownFunc = services.VirshDestroyVM
+	default:
+		return fmt.Errorf("unexpected VMShutdownMode: %s", mode)
+	}
+
+	g.GinkgoT().Printf("%s: VM %s (timeout: %v)\n", mode, vm, timeout)
+	err := shutdownFunc(vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+	if err != nil {
+		return err
+	}
+
+	return services.WaitForVMState(vm, services.VMStateShutOff, timeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+}
+
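+// restartVms kicks off a start for every VM in dataPair before waiting, so the VMs boot
+// in parallel. It runs both mid-test and as deferred cleanup, so start failures are only
+// logged; the subsequent wait asserts that each VM actually reaches the running state.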
+func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) {
+	// Start all VMs asynchronously
+	for _, d := range dataPair {
+		if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil {
+			fmt.Fprintf(g.GinkgoWriter, "Warning: failed to start VM %s: %v\n", d.vm, err)
+		}
+	}
+
+	// Wait for all VMs to be running
+	for _, d := range dataPair {
+		err := services.WaitForVMState(d.vm, services.VMStateRunning, vmRestartTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start within the %s timeout", d.vm, d.node, vmRestartTimeout))
+	}
+}