Skip to content

Commit 08cee91

Browse files
Added test cases for certificate rotation for Hot Reload and Graceful Restart (#464)
1 parent 1bdf558 commit 08cee91

File tree

2 files changed

+911
-0
lines changed

2 files changed

+911
-0
lines changed

test/docker_cert_rotation_test.go

Lines changed: 395 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,395 @@
1+
// DISCLAIMER
2+
//
3+
// # Copyright 2024 ArangoDB GmbH, Cologne, Germany
4+
//
5+
// Licensed under the Apache License, Version 2.0 (the "License");
6+
// you may not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
//
17+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
18+
package test
19+
20+
import (
21+
"fmt"
22+
"os"
23+
"os/exec"
24+
"path/filepath"
25+
"strings"
26+
"testing"
27+
"time"
28+
29+
"github.com/arangodb-helper/arangodb/client"
30+
"github.com/stretchr/testify/require"
31+
)
32+
33+
// TestDockerClusterSSLCertRotationHotReload tests certificate hot reload without restart in Docker containers
34+
// This test validates Scenario 2: Changing certificate content only via /_admin/server/tls endpoint
35+
// This test runs 3 arangodb starters in Docker containers with SSL enabled
36+
//
37+
// NOTE: This test uses --net=host which has known limitations on WSL2:
38+
// - Docker containers can communicate internally (proven via manual testing)
39+
// - But WSL2 host cannot connect to --net=host containers due to networking architecture
40+
// - Works fine on native Linux (CircleCI)
41+
//
42+
// If this test fails with "connection refused" on WSL2, use process-mode tests instead:
43+
// - TestProcessClusterSSLCertRotationHotReload
44+
func TestDockerClusterSSLCertRotationHotReload(t *testing.T) {
45+
// Detect WSL2 and skip if detected (networking issues with --net=host)
46+
if data, err := os.ReadFile("/proc/version"); err == nil {
47+
version := strings.ToLower(string(data))
48+
if strings.Contains(version, "microsoft") || strings.Contains(version, "wsl") {
49+
t.Skip("Skipping Docker mode test on WSL2 - Docker --net=host networking doesn't work properly in WSL2. " +
50+
"The test works on native Linux (CircleCI). Use TestProcessClusterSSLCertRotationHotReload instead.")
51+
}
52+
}
53+
54+
testMatch(t, testModeDocker, starterModeCluster, false)
55+
56+
// Create temporary directory for certificates on host
57+
certDir, err := os.MkdirTemp("", "ssl-cert-test")
58+
require.NoError(t, err, "Failed to create temp directory")
59+
defer os.RemoveAll(certDir)
60+
61+
// Create first certificate
62+
cert1Path := filepath.Join(certDir, "server.keyfile")
63+
err = createTestCertificate(cert1Path, "initial-cert")
64+
require.NoError(t, err, "Failed to create initial certificate")
65+
66+
// Create Docker volumes for data persistence
67+
cID1 := createDockerID("starter-test-ssl-hotreload1-")
68+
createDockerVolume(t, cID1)
69+
defer removeDockerVolume(t, cID1)
70+
71+
cID2 := createDockerID("starter-test-ssl-hotreload2-")
72+
createDockerVolume(t, cID2)
73+
defer removeDockerVolume(t, cID2)
74+
75+
cID3 := createDockerID("starter-test-ssl-hotreload3-")
76+
createDockerVolume(t, cID3)
77+
defer removeDockerVolume(t, cID3)
78+
79+
// Cleanup leftover containers
80+
removeDockerContainersByLabel(t, "starter-test=true")
81+
removeStarterCreatedDockerContainers(t)
82+
83+
// Mount certificate directory into containers
84+
certMount := fmt.Sprintf("-v %s:/certs", certDir)
85+
certArg := "--ssl.keyfile=/certs/server.keyfile"
86+
87+
joins := fmt.Sprintf("localhost:%d,localhost:%d,localhost:%d",
88+
basePort, basePort+(1*portIncrement), basePort+(2*portIncrement))
89+
90+
start := time.Now()
91+
92+
// Start 3 Docker containers with SSL
93+
dockerRun1 := spawnMemberInDocker(t, basePort, cID1, joins, certArg, certMount)
94+
defer dockerRun1.Close()
95+
defer removeDockerContainer(t, cID1)
96+
97+
dockerRun2 := spawnMemberInDocker(t, basePort+(1*portIncrement), cID2, joins, certArg, certMount)
98+
defer dockerRun2.Close()
99+
defer removeDockerContainer(t, cID2)
100+
101+
dockerRun3 := spawnMemberInDocker(t, basePort+(2*portIncrement), cID3, joins, certArg, certMount)
102+
defer dockerRun3.Close()
103+
defer removeDockerContainer(t, cID3)
104+
105+
// Wait for cluster to be ready
106+
if ok := WaitUntilStarterReady(t, whatCluster, 3, dockerRun1, dockerRun2, dockerRun3); ok {
107+
t.Logf("Cluster start with SSL took %s", time.Since(start))
108+
109+
// In Docker mode with SSL, give extra time for SSL endpoints to fully initialize
110+
t.Log("Waiting 30 seconds for SSL endpoints to be ready...")
111+
time.Sleep(30 * time.Second)
112+
for i := 0; i < 3; i++ {
113+
testCluster(t, secureStarterEndpoint(i*portIncrement), true)
114+
}
115+
}
116+
117+
// Get the certificate serial number from all server types before rotation
118+
t.Log("Checking initial certificate on all server types")
119+
initialCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement))
120+
require.NotEmpty(t, initialCertSerials, "Should have initial certificate serials")
121+
122+
// Create and write new certificate to the SAME path on host (mounted into containers)
123+
t.Log("Creating new certificate at the same path (mounted into containers)")
124+
err = createTestCertificate(cert1Path, "rotated-cert")
125+
require.NoError(t, err, "Failed to create rotated certificate")
126+
127+
// Force filesystem sync to ensure certificate file is written
128+
t.Log("Forcing filesystem sync")
129+
syncCmd := exec.Command("sync")
130+
if err := syncCmd.Run(); err != nil {
131+
t.Logf("Warning: sync command failed: %v", err)
132+
}
133+
// Give filesystem time to propagate changes through Docker volume mounts and nested containers
134+
// Docker has multiple layers: host -> starter container -> nested ArangoDB containers
135+
t.Log("Waiting 30 seconds for filesystem to propagate changes through Docker layers...")
136+
time.Sleep(30 * time.Second)
137+
138+
// Trigger hot reload on all Docker nodes with retries
139+
t.Log("Triggering hot reload on all Docker nodes (with retries)...")
140+
for i := 0; i < 3; i++ {
141+
endpoint := secureStarterEndpoint(i * portIncrement)
142+
t.Logf("Attempting hot reload on node-%d (%s)", i+1, endpoint)
143+
err = reloadCertificatesViaAPI(t, endpoint)
144+
require.NoError(t, err, "Failed to reload certificates via API")
145+
}
146+
147+
// Give servers time to reload - Docker OverlayFS caching means this takes longer than process mode
148+
// Coordinators and DBServers need more time than agents to reload in nested containers
149+
t.Log("Waiting 60 seconds for certificates to be reloaded (Docker OverlayFS propagation delay)...")
150+
time.Sleep(60 * time.Second)
151+
152+
// First check: Verify certificates were reloaded
153+
t.Log("First verification: Checking if certificates were reloaded")
154+
newCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement))
155+
require.NotEmpty(t, newCertSerials, "Should have new certificate serials")
156+
157+
// Check which servers have rotated
158+
rotatedServers := make(map[client.ServerType]bool)
159+
for serverType, newSerial := range newCertSerials {
160+
initialSerial := initialCertSerials[serverType]
161+
if newSerial != initialSerial {
162+
t.Logf("Certificate rotated successfully on %s: %s -> %s", serverType, initialSerial, newSerial)
163+
rotatedServers[serverType] = true
164+
} else {
165+
t.Logf("Certificate NOT YET rotated on %s: serial remained %s", serverType, initialSerial)
166+
}
167+
}
168+
169+
// If not all servers have rotated, wait longer and check again
170+
if !rotatedServers[client.ServerTypeCoordinator] || !rotatedServers[client.ServerTypeDBServer] {
171+
t.Log("Some servers haven't rotated yet. Waiting additional 60 seconds for slower Docker OverlayFS propagation...")
172+
time.Sleep(60 * time.Second)
173+
174+
// Second check: Re-verify certificates after additional wait
175+
t.Log("Second verification: Re-checking certificates after extended wait")
176+
newCertSerials = getCertificateSerials(t, secureStarterEndpoint(0*portIncrement))
177+
178+
// Update rotatedServers with new check
179+
for serverType, newSerial := range newCertSerials {
180+
initialSerial := initialCertSerials[serverType]
181+
if newSerial != initialSerial {
182+
if !rotatedServers[serverType] {
183+
t.Logf("Certificate NOW rotated on %s (after extended wait): %s -> %s", serverType, initialSerial, newSerial)
184+
}
185+
rotatedServers[serverType] = true
186+
} else {
187+
t.Errorf("Certificate STILL NOT rotated on %s after 180s total wait: serial remained %s", serverType, initialSerial)
188+
}
189+
}
190+
}
191+
192+
// Verify all server types rotated successfully
193+
require.True(t, rotatedServers[client.ServerTypeCoordinator], "Coordinator should have successfully rotated certificate")
194+
require.True(t, rotatedServers[client.ServerTypeDBServer], "DBServer should have successfully rotated certificate")
195+
require.True(t, rotatedServers[client.ServerTypeAgent], "Agent should have successfully rotated certificate")
196+
197+
// Verify cluster still works after rotation
198+
t.Log("Verifying cluster functionality after certificate rotation")
199+
testCluster(t, secureStarterEndpoint(0*portIncrement), true)
200+
201+
// Graceful shutdown
202+
waitForCallFunction(t,
203+
ShutdownStarterCall(secureStarterEndpoint(0*portIncrement)),
204+
ShutdownStarterCall(secureStarterEndpoint(1*portIncrement)),
205+
ShutdownStarterCall(secureStarterEndpoint(2*portIncrement)))
206+
}
207+
208+
// TestDockerClusterSSLCertRotationGracefulRestart tests certificate rotation via graceful restart in Docker containers
209+
// This test validates Scenario 1: Replacing certificate file and restarting cluster
210+
// This test runs 3 arangodb starters in Docker containers with SSL enabled
211+
//
212+
// NOTE: This test uses --net=host which has known limitations on WSL2:
213+
// - Docker containers can communicate internally (proven via manual testing)
214+
// - But WSL2 host cannot connect to --net=host containers due to networking architecture
215+
// - Works fine on native Linux (CircleCI)
216+
//
217+
// If this test fails with "connection refused" on WSL2, use process-mode tests instead:
218+
// - TestProcessClusterReplaceCert
219+
func TestDockerClusterSSLCertRotationGracefulRestart(t *testing.T) {
220+
// Detect WSL2 and skip if detected (networking issues with --net=host)
221+
if data, err := os.ReadFile("/proc/version"); err == nil {
222+
version := strings.ToLower(string(data))
223+
if strings.Contains(version, "microsoft") || strings.Contains(version, "wsl") {
224+
t.Skip("Skipping Docker mode test on WSL2 - Docker --net=host networking doesn't work properly in WSL2. " +
225+
"The test works on native Linux (CircleCI). Use TestProcessClusterReplaceCert instead.")
226+
}
227+
}
228+
229+
testMatch(t, testModeDocker, starterModeCluster, false)
230+
231+
// Create temporary directory for certificates on host
232+
certDir, err := os.MkdirTemp("", "ssl-cert-restart-test")
233+
require.NoError(t, err, "Failed to create temp directory")
234+
defer os.RemoveAll(certDir)
235+
236+
// Create first certificate
237+
cert1Path := filepath.Join(certDir, "server.keyfile")
238+
err = createTestCertificate(cert1Path, "initial-cert")
239+
require.NoError(t, err, "Failed to create initial certificate")
240+
241+
// Create Docker volumes for data persistence
242+
cID1 := createDockerID("starter-test-ssl-restart1-")
243+
createDockerVolume(t, cID1)
244+
defer removeDockerVolume(t, cID1)
245+
246+
cID2 := createDockerID("starter-test-ssl-restart2-")
247+
createDockerVolume(t, cID2)
248+
defer removeDockerVolume(t, cID2)
249+
250+
cID3 := createDockerID("starter-test-ssl-restart3-")
251+
createDockerVolume(t, cID3)
252+
defer removeDockerVolume(t, cID3)
253+
254+
// Cleanup leftover containers
255+
removeDockerContainersByLabel(t, "starter-test=true")
256+
removeStarterCreatedDockerContainers(t)
257+
258+
// Mount certificate directory into containers
259+
certMount := fmt.Sprintf("-v %s:/certs", certDir)
260+
certArg := "--ssl.keyfile=/certs/server.keyfile"
261+
262+
joins := fmt.Sprintf("localhost:%d,localhost:%d,localhost:%d",
263+
basePort, basePort+(1*portIncrement), basePort+(2*portIncrement))
264+
265+
start := time.Now()
266+
267+
// Start 3 Docker containers with SSL
268+
dockerRun1 := spawnMemberInDocker(t, basePort, cID1, joins, certArg, certMount)
269+
defer dockerRun1.Close()
270+
defer removeDockerContainer(t, cID1)
271+
272+
dockerRun2 := spawnMemberInDocker(t, basePort+(1*portIncrement), cID2, joins, certArg, certMount)
273+
defer dockerRun2.Close()
274+
defer removeDockerContainer(t, cID2)
275+
276+
dockerRun3 := spawnMemberInDocker(t, basePort+(2*portIncrement), cID3, joins, certArg, certMount)
277+
defer dockerRun3.Close()
278+
defer removeDockerContainer(t, cID3)
279+
280+
// Wait for cluster to be ready
281+
if ok := WaitUntilStarterReady(t, whatCluster, 3, dockerRun1, dockerRun2, dockerRun3); ok {
282+
t.Logf("Cluster start with SSL took %s", time.Since(start))
283+
284+
// In Docker mode with SSL, give extra time for SSL endpoints to fully initialize
285+
t.Log("Waiting 30 seconds for SSL endpoints to be ready...")
286+
time.Sleep(30 * time.Second)
287+
288+
for i := 0; i < 3; i++ {
289+
testCluster(t, secureStarterEndpoint(i*portIncrement), true)
290+
}
291+
}
292+
293+
// Get the certificate serial number from all server types before rotation
294+
t.Log("Checking initial certificate on all server types")
295+
initialCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement))
296+
require.NotEmpty(t, initialCertSerials, "Should have initial certificate serials")
297+
298+
// Gracefully shutdown all starters
299+
t.Log("Shutting down cluster for certificate replacement")
300+
waitForCallFunction(t,
301+
ShutdownStarterCall(secureStarterEndpoint(0*portIncrement)),
302+
ShutdownStarterCall(secureStarterEndpoint(1*portIncrement)),
303+
ShutdownStarterCall(secureStarterEndpoint(2*portIncrement)),
304+
)
305+
306+
// Wait for graceful shutdown
307+
t.Log("Waiting 15 seconds for graceful shutdown...")
308+
time.Sleep(15 * time.Second)
309+
310+
// Stop any remaining nested containers (they may not stop automatically)
311+
t.Log("Stopping any remaining nested containers...")
312+
removeStarterCreatedDockerContainers(t)
313+
314+
// Verify all containers stopped
315+
t.Log("Verifying all containers stopped")
316+
time.Sleep(5 * time.Second)
317+
318+
// Replace certificate file while cluster is stopped
319+
t.Log("Replacing certificate file with new one")
320+
err = createTestCertificate(cert1Path, "restarted-cert")
321+
require.NoError(t, err, "Failed to create new certificate")
322+
323+
// Force filesystem sync
324+
t.Log("Forcing filesystem sync")
325+
syncCmd := exec.Command("sync")
326+
if err := syncCmd.Run(); err != nil {
327+
t.Logf("Warning: sync command failed: %v", err)
328+
}
329+
time.Sleep(2 * time.Second)
330+
331+
// Restart cluster with SAME volumes (data persists) and NEW certificate
332+
t.Log("Restarting cluster with same volumes (preserving data) and new certificate")
333+
restartStart := time.Now()
334+
335+
dockerRun1Restart := spawnMemberInDocker(t, basePort, cID1, joins, certArg, certMount)
336+
defer dockerRun1Restart.Close()
337+
338+
dockerRun2Restart := spawnMemberInDocker(t, basePort+(1*portIncrement), cID2, joins, certArg, certMount)
339+
defer dockerRun2Restart.Close()
340+
341+
dockerRun3Restart := spawnMemberInDocker(t, basePort+(2*portIncrement), cID3, joins, certArg, certMount)
342+
defer dockerRun3Restart.Close()
343+
344+
// Wait for cluster to restart (should be faster since data already exists)
345+
if ok := WaitUntilStarterReady(t, whatCluster, 3, dockerRun1Restart, dockerRun2Restart, dockerRun3Restart); ok {
346+
t.Logf("Cluster restarted with new certificate took %s", time.Since(restartStart))
347+
348+
// Give SSL endpoints time to initialize after restart
349+
t.Log("Waiting 30 seconds for SSL endpoints to be ready after restart...")
350+
time.Sleep(30 * time.Second)
351+
352+
// Verify cluster functionality
353+
for i := 0; i < 3; i++ {
354+
testCluster(t, secureStarterEndpoint(i*portIncrement), true)
355+
}
356+
357+
// Verify certificates were replaced
358+
t.Log("Verifying that certificates changed after restart")
359+
newCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement))
360+
require.NotEmpty(t, newCertSerials, "Should have new certificate serials")
361+
362+
// Verify all server types have new certificates
363+
for serverType, newSerial := range newCertSerials {
364+
oldSerial := initialCertSerials[serverType]
365+
if oldSerial == "" {
366+
t.Errorf("No initial certificate found for %s", serverType)
367+
continue
368+
}
369+
if newSerial == "" {
370+
t.Errorf("No new certificate found for %s", serverType)
371+
continue
372+
}
373+
if oldSerial == newSerial {
374+
t.Errorf("Certificate NOT replaced on %s: serial remained %s", serverType, oldSerial)
375+
} else {
376+
t.Logf("Certificate replaced on %s: %s -> %s", serverType, oldSerial, newSerial)
377+
}
378+
}
379+
380+
require.NotEqual(t, initialCertSerials[client.ServerTypeCoordinator],
381+
newCertSerials[client.ServerTypeCoordinator], "Coordinator certificate should change")
382+
require.NotEqual(t, initialCertSerials[client.ServerTypeDBServer],
383+
newCertSerials[client.ServerTypeDBServer], "DBServer certificate should change")
384+
require.NotEqual(t, initialCertSerials[client.ServerTypeAgent],
385+
newCertSerials[client.ServerTypeAgent], "Agent certificate should change")
386+
387+
t.Log("All certificates successfully rotated after graceful restart")
388+
}
389+
390+
// Final graceful shutdown
391+
waitForCallFunction(t,
392+
ShutdownStarterCall(secureStarterEndpoint(0*portIncrement)),
393+
ShutdownStarterCall(secureStarterEndpoint(1*portIncrement)),
394+
ShutdownStarterCall(secureStarterEndpoint(2*portIncrement)))
395+
}

0 commit comments

Comments
 (0)