|
| 1 | +// DISCLAIMER |
| 2 | +// |
| 3 | +// # Copyright 2024 ArangoDB GmbH, Cologne, Germany |
| 4 | +// |
| 5 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +// you may not use this file except in compliance with the License. |
| 7 | +// You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, software |
| 12 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +// See the License for the specific language governing permissions and |
| 15 | +// limitations under the License. |
| 16 | +// |
| 17 | +// Copyright holder is ArangoDB GmbH, Cologne, Germany |
| 18 | +package test |
| 19 | + |
| 20 | +import ( |
| 21 | + "fmt" |
| 22 | + "os" |
| 23 | + "os/exec" |
| 24 | + "path/filepath" |
| 25 | + "strings" |
| 26 | + "testing" |
| 27 | + "time" |
| 28 | + |
| 29 | + "github.com/arangodb-helper/arangodb/client" |
| 30 | + "github.com/stretchr/testify/require" |
| 31 | +) |
| 32 | + |
| 33 | +// TestDockerClusterSSLCertRotationHotReload tests certificate hot reload without restart in Docker containers |
| 34 | +// This test validates Scenario 2: Changing certificate content only via /_admin/server/tls endpoint |
| 35 | +// This test runs 3 arangodb starters in Docker containers with SSL enabled |
| 36 | +// |
| 37 | +// NOTE: This test uses --net=host which has known limitations on WSL2: |
| 38 | +// - Docker containers can communicate internally (proven via manual testing) |
| 39 | +// - But WSL2 host cannot connect to --net=host containers due to networking architecture |
| 40 | +// - Works fine on native Linux (CircleCI) |
| 41 | +// |
| 42 | +// If this test fails with "connection refused" on WSL2, use process-mode tests instead: |
| 43 | +// - TestProcessClusterSSLCertRotationHotReload |
| 44 | +func TestDockerClusterSSLCertRotationHotReload(t *testing.T) { |
| 45 | + // Detect WSL2 and skip if detected (networking issues with --net=host) |
| 46 | + if data, err := os.ReadFile("/proc/version"); err == nil { |
| 47 | + version := strings.ToLower(string(data)) |
| 48 | + if strings.Contains(version, "microsoft") || strings.Contains(version, "wsl") { |
| 49 | + t.Skip("Skipping Docker mode test on WSL2 - Docker --net=host networking doesn't work properly in WSL2. " + |
| 50 | + "The test works on native Linux (CircleCI). Use TestProcessClusterSSLCertRotationHotReload instead.") |
| 51 | + } |
| 52 | + } |
| 53 | + |
| 54 | + testMatch(t, testModeDocker, starterModeCluster, false) |
| 55 | + |
| 56 | + // Create temporary directory for certificates on host |
| 57 | + certDir, err := os.MkdirTemp("", "ssl-cert-test") |
| 58 | + require.NoError(t, err, "Failed to create temp directory") |
| 59 | + defer os.RemoveAll(certDir) |
| 60 | + |
| 61 | + // Create first certificate |
| 62 | + cert1Path := filepath.Join(certDir, "server.keyfile") |
| 63 | + err = createTestCertificate(cert1Path, "initial-cert") |
| 64 | + require.NoError(t, err, "Failed to create initial certificate") |
| 65 | + |
| 66 | + // Create Docker volumes for data persistence |
| 67 | + cID1 := createDockerID("starter-test-ssl-hotreload1-") |
| 68 | + createDockerVolume(t, cID1) |
| 69 | + defer removeDockerVolume(t, cID1) |
| 70 | + |
| 71 | + cID2 := createDockerID("starter-test-ssl-hotreload2-") |
| 72 | + createDockerVolume(t, cID2) |
| 73 | + defer removeDockerVolume(t, cID2) |
| 74 | + |
| 75 | + cID3 := createDockerID("starter-test-ssl-hotreload3-") |
| 76 | + createDockerVolume(t, cID3) |
| 77 | + defer removeDockerVolume(t, cID3) |
| 78 | + |
| 79 | + // Cleanup leftover containers |
| 80 | + removeDockerContainersByLabel(t, "starter-test=true") |
| 81 | + removeStarterCreatedDockerContainers(t) |
| 82 | + |
| 83 | + // Mount certificate directory into containers |
| 84 | + certMount := fmt.Sprintf("-v %s:/certs", certDir) |
| 85 | + certArg := "--ssl.keyfile=/certs/server.keyfile" |
| 86 | + |
| 87 | + joins := fmt.Sprintf("localhost:%d,localhost:%d,localhost:%d", |
| 88 | + basePort, basePort+(1*portIncrement), basePort+(2*portIncrement)) |
| 89 | + |
| 90 | + start := time.Now() |
| 91 | + |
| 92 | + // Start 3 Docker containers with SSL |
| 93 | + dockerRun1 := spawnMemberInDocker(t, basePort, cID1, joins, certArg, certMount) |
| 94 | + defer dockerRun1.Close() |
| 95 | + defer removeDockerContainer(t, cID1) |
| 96 | + |
| 97 | + dockerRun2 := spawnMemberInDocker(t, basePort+(1*portIncrement), cID2, joins, certArg, certMount) |
| 98 | + defer dockerRun2.Close() |
| 99 | + defer removeDockerContainer(t, cID2) |
| 100 | + |
| 101 | + dockerRun3 := spawnMemberInDocker(t, basePort+(2*portIncrement), cID3, joins, certArg, certMount) |
| 102 | + defer dockerRun3.Close() |
| 103 | + defer removeDockerContainer(t, cID3) |
| 104 | + |
| 105 | + // Wait for cluster to be ready |
| 106 | + if ok := WaitUntilStarterReady(t, whatCluster, 3, dockerRun1, dockerRun2, dockerRun3); ok { |
| 107 | + t.Logf("Cluster start with SSL took %s", time.Since(start)) |
| 108 | + |
| 109 | + // In Docker mode with SSL, give extra time for SSL endpoints to fully initialize |
| 110 | + t.Log("Waiting 30 seconds for SSL endpoints to be ready...") |
| 111 | + time.Sleep(30 * time.Second) |
| 112 | + for i := 0; i < 3; i++ { |
| 113 | + testCluster(t, secureStarterEndpoint(i*portIncrement), true) |
| 114 | + } |
| 115 | + } |
| 116 | + |
| 117 | + // Get the certificate serial number from all server types before rotation |
| 118 | + t.Log("Checking initial certificate on all server types") |
| 119 | + initialCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement)) |
| 120 | + require.NotEmpty(t, initialCertSerials, "Should have initial certificate serials") |
| 121 | + |
| 122 | + // Create and write new certificate to the SAME path on host (mounted into containers) |
| 123 | + t.Log("Creating new certificate at the same path (mounted into containers)") |
| 124 | + err = createTestCertificate(cert1Path, "rotated-cert") |
| 125 | + require.NoError(t, err, "Failed to create rotated certificate") |
| 126 | + |
| 127 | + // Force filesystem sync to ensure certificate file is written |
| 128 | + t.Log("Forcing filesystem sync") |
| 129 | + syncCmd := exec.Command("sync") |
| 130 | + if err := syncCmd.Run(); err != nil { |
| 131 | + t.Logf("Warning: sync command failed: %v", err) |
| 132 | + } |
| 133 | + // Give filesystem time to propagate changes through Docker volume mounts and nested containers |
| 134 | + // Docker has multiple layers: host -> starter container -> nested ArangoDB containers |
| 135 | + t.Log("Waiting 30 seconds for filesystem to propagate changes through Docker layers...") |
| 136 | + time.Sleep(30 * time.Second) |
| 137 | + |
| 138 | + // Trigger hot reload on all Docker nodes with retries |
| 139 | + t.Log("Triggering hot reload on all Docker nodes (with retries)...") |
| 140 | + for i := 0; i < 3; i++ { |
| 141 | + endpoint := secureStarterEndpoint(i * portIncrement) |
| 142 | + t.Logf("Attempting hot reload on node-%d (%s)", i+1, endpoint) |
| 143 | + err = reloadCertificatesViaAPI(t, endpoint) |
| 144 | + require.NoError(t, err, "Failed to reload certificates via API") |
| 145 | + } |
| 146 | + |
| 147 | + // Give servers time to reload - Docker OverlayFS caching means this takes longer than process mode |
| 148 | + // Coordinators and DBServers need more time than agents to reload in nested containers |
| 149 | + t.Log("Waiting 60 seconds for certificates to be reloaded (Docker OverlayFS propagation delay)...") |
| 150 | + time.Sleep(60 * time.Second) |
| 151 | + |
| 152 | + // First check: Verify certificates were reloaded |
| 153 | + t.Log("First verification: Checking if certificates were reloaded") |
| 154 | + newCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement)) |
| 155 | + require.NotEmpty(t, newCertSerials, "Should have new certificate serials") |
| 156 | + |
| 157 | + // Check which servers have rotated |
| 158 | + rotatedServers := make(map[client.ServerType]bool) |
| 159 | + for serverType, newSerial := range newCertSerials { |
| 160 | + initialSerial := initialCertSerials[serverType] |
| 161 | + if newSerial != initialSerial { |
| 162 | + t.Logf("Certificate rotated successfully on %s: %s -> %s", serverType, initialSerial, newSerial) |
| 163 | + rotatedServers[serverType] = true |
| 164 | + } else { |
| 165 | + t.Logf("Certificate NOT YET rotated on %s: serial remained %s", serverType, initialSerial) |
| 166 | + } |
| 167 | + } |
| 168 | + |
| 169 | + // If not all servers have rotated, wait longer and check again |
| 170 | + if !rotatedServers[client.ServerTypeCoordinator] || !rotatedServers[client.ServerTypeDBServer] { |
| 171 | + t.Log("Some servers haven't rotated yet. Waiting additional 60 seconds for slower Docker OverlayFS propagation...") |
| 172 | + time.Sleep(60 * time.Second) |
| 173 | + |
| 174 | + // Second check: Re-verify certificates after additional wait |
| 175 | + t.Log("Second verification: Re-checking certificates after extended wait") |
| 176 | + newCertSerials = getCertificateSerials(t, secureStarterEndpoint(0*portIncrement)) |
| 177 | + |
| 178 | + // Update rotatedServers with new check |
| 179 | + for serverType, newSerial := range newCertSerials { |
| 180 | + initialSerial := initialCertSerials[serverType] |
| 181 | + if newSerial != initialSerial { |
| 182 | + if !rotatedServers[serverType] { |
| 183 | + t.Logf("Certificate NOW rotated on %s (after extended wait): %s -> %s", serverType, initialSerial, newSerial) |
| 184 | + } |
| 185 | + rotatedServers[serverType] = true |
| 186 | + } else { |
| 187 | + t.Errorf("Certificate STILL NOT rotated on %s after 180s total wait: serial remained %s", serverType, initialSerial) |
| 188 | + } |
| 189 | + } |
| 190 | + } |
| 191 | + |
| 192 | + // Verify all server types rotated successfully |
| 193 | + require.True(t, rotatedServers[client.ServerTypeCoordinator], "Coordinator should have successfully rotated certificate") |
| 194 | + require.True(t, rotatedServers[client.ServerTypeDBServer], "DBServer should have successfully rotated certificate") |
| 195 | + require.True(t, rotatedServers[client.ServerTypeAgent], "Agent should have successfully rotated certificate") |
| 196 | + |
| 197 | + // Verify cluster still works after rotation |
| 198 | + t.Log("Verifying cluster functionality after certificate rotation") |
| 199 | + testCluster(t, secureStarterEndpoint(0*portIncrement), true) |
| 200 | + |
| 201 | + // Graceful shutdown |
| 202 | + waitForCallFunction(t, |
| 203 | + ShutdownStarterCall(secureStarterEndpoint(0*portIncrement)), |
| 204 | + ShutdownStarterCall(secureStarterEndpoint(1*portIncrement)), |
| 205 | + ShutdownStarterCall(secureStarterEndpoint(2*portIncrement))) |
| 206 | +} |
| 207 | + |
| 208 | +// TestDockerClusterSSLCertRotationGracefulRestart tests certificate rotation via graceful restart in Docker containers |
| 209 | +// This test validates Scenario 1: Replacing certificate file and restarting cluster |
| 210 | +// This test runs 3 arangodb starters in Docker containers with SSL enabled |
| 211 | +// |
| 212 | +// NOTE: This test uses --net=host which has known limitations on WSL2: |
| 213 | +// - Docker containers can communicate internally (proven via manual testing) |
| 214 | +// - But WSL2 host cannot connect to --net=host containers due to networking architecture |
| 215 | +// - Works fine on native Linux (CircleCI) |
| 216 | +// |
| 217 | +// If this test fails with "connection refused" on WSL2, use process-mode tests instead: |
| 218 | +// - TestProcessClusterReplaceCert |
| 219 | +func TestDockerClusterSSLCertRotationGracefulRestart(t *testing.T) { |
| 220 | + // Detect WSL2 and skip if detected (networking issues with --net=host) |
| 221 | + if data, err := os.ReadFile("/proc/version"); err == nil { |
| 222 | + version := strings.ToLower(string(data)) |
| 223 | + if strings.Contains(version, "microsoft") || strings.Contains(version, "wsl") { |
| 224 | + t.Skip("Skipping Docker mode test on WSL2 - Docker --net=host networking doesn't work properly in WSL2. " + |
| 225 | + "The test works on native Linux (CircleCI). Use TestProcessClusterReplaceCert instead.") |
| 226 | + } |
| 227 | + } |
| 228 | + |
| 229 | + testMatch(t, testModeDocker, starterModeCluster, false) |
| 230 | + |
| 231 | + // Create temporary directory for certificates on host |
| 232 | + certDir, err := os.MkdirTemp("", "ssl-cert-restart-test") |
| 233 | + require.NoError(t, err, "Failed to create temp directory") |
| 234 | + defer os.RemoveAll(certDir) |
| 235 | + |
| 236 | + // Create first certificate |
| 237 | + cert1Path := filepath.Join(certDir, "server.keyfile") |
| 238 | + err = createTestCertificate(cert1Path, "initial-cert") |
| 239 | + require.NoError(t, err, "Failed to create initial certificate") |
| 240 | + |
| 241 | + // Create Docker volumes for data persistence |
| 242 | + cID1 := createDockerID("starter-test-ssl-restart1-") |
| 243 | + createDockerVolume(t, cID1) |
| 244 | + defer removeDockerVolume(t, cID1) |
| 245 | + |
| 246 | + cID2 := createDockerID("starter-test-ssl-restart2-") |
| 247 | + createDockerVolume(t, cID2) |
| 248 | + defer removeDockerVolume(t, cID2) |
| 249 | + |
| 250 | + cID3 := createDockerID("starter-test-ssl-restart3-") |
| 251 | + createDockerVolume(t, cID3) |
| 252 | + defer removeDockerVolume(t, cID3) |
| 253 | + |
| 254 | + // Cleanup leftover containers |
| 255 | + removeDockerContainersByLabel(t, "starter-test=true") |
| 256 | + removeStarterCreatedDockerContainers(t) |
| 257 | + |
| 258 | + // Mount certificate directory into containers |
| 259 | + certMount := fmt.Sprintf("-v %s:/certs", certDir) |
| 260 | + certArg := "--ssl.keyfile=/certs/server.keyfile" |
| 261 | + |
| 262 | + joins := fmt.Sprintf("localhost:%d,localhost:%d,localhost:%d", |
| 263 | + basePort, basePort+(1*portIncrement), basePort+(2*portIncrement)) |
| 264 | + |
| 265 | + start := time.Now() |
| 266 | + |
| 267 | + // Start 3 Docker containers with SSL |
| 268 | + dockerRun1 := spawnMemberInDocker(t, basePort, cID1, joins, certArg, certMount) |
| 269 | + defer dockerRun1.Close() |
| 270 | + defer removeDockerContainer(t, cID1) |
| 271 | + |
| 272 | + dockerRun2 := spawnMemberInDocker(t, basePort+(1*portIncrement), cID2, joins, certArg, certMount) |
| 273 | + defer dockerRun2.Close() |
| 274 | + defer removeDockerContainer(t, cID2) |
| 275 | + |
| 276 | + dockerRun3 := spawnMemberInDocker(t, basePort+(2*portIncrement), cID3, joins, certArg, certMount) |
| 277 | + defer dockerRun3.Close() |
| 278 | + defer removeDockerContainer(t, cID3) |
| 279 | + |
| 280 | + // Wait for cluster to be ready |
| 281 | + if ok := WaitUntilStarterReady(t, whatCluster, 3, dockerRun1, dockerRun2, dockerRun3); ok { |
| 282 | + t.Logf("Cluster start with SSL took %s", time.Since(start)) |
| 283 | + |
| 284 | + // In Docker mode with SSL, give extra time for SSL endpoints to fully initialize |
| 285 | + t.Log("Waiting 30 seconds for SSL endpoints to be ready...") |
| 286 | + time.Sleep(30 * time.Second) |
| 287 | + |
| 288 | + for i := 0; i < 3; i++ { |
| 289 | + testCluster(t, secureStarterEndpoint(i*portIncrement), true) |
| 290 | + } |
| 291 | + } |
| 292 | + |
| 293 | + // Get the certificate serial number from all server types before rotation |
| 294 | + t.Log("Checking initial certificate on all server types") |
| 295 | + initialCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement)) |
| 296 | + require.NotEmpty(t, initialCertSerials, "Should have initial certificate serials") |
| 297 | + |
| 298 | + // Gracefully shutdown all starters |
| 299 | + t.Log("Shutting down cluster for certificate replacement") |
| 300 | + waitForCallFunction(t, |
| 301 | + ShutdownStarterCall(secureStarterEndpoint(0*portIncrement)), |
| 302 | + ShutdownStarterCall(secureStarterEndpoint(1*portIncrement)), |
| 303 | + ShutdownStarterCall(secureStarterEndpoint(2*portIncrement)), |
| 304 | + ) |
| 305 | + |
| 306 | + // Wait for graceful shutdown |
| 307 | + t.Log("Waiting 15 seconds for graceful shutdown...") |
| 308 | + time.Sleep(15 * time.Second) |
| 309 | + |
| 310 | + // Stop any remaining nested containers (they may not stop automatically) |
| 311 | + t.Log("Stopping any remaining nested containers...") |
| 312 | + removeStarterCreatedDockerContainers(t) |
| 313 | + |
| 314 | + // Verify all containers stopped |
| 315 | + t.Log("Verifying all containers stopped") |
| 316 | + time.Sleep(5 * time.Second) |
| 317 | + |
| 318 | + // Replace certificate file while cluster is stopped |
| 319 | + t.Log("Replacing certificate file with new one") |
| 320 | + err = createTestCertificate(cert1Path, "restarted-cert") |
| 321 | + require.NoError(t, err, "Failed to create new certificate") |
| 322 | + |
| 323 | + // Force filesystem sync |
| 324 | + t.Log("Forcing filesystem sync") |
| 325 | + syncCmd := exec.Command("sync") |
| 326 | + if err := syncCmd.Run(); err != nil { |
| 327 | + t.Logf("Warning: sync command failed: %v", err) |
| 328 | + } |
| 329 | + time.Sleep(2 * time.Second) |
| 330 | + |
| 331 | + // Restart cluster with SAME volumes (data persists) and NEW certificate |
| 332 | + t.Log("Restarting cluster with same volumes (preserving data) and new certificate") |
| 333 | + restartStart := time.Now() |
| 334 | + |
| 335 | + dockerRun1Restart := spawnMemberInDocker(t, basePort, cID1, joins, certArg, certMount) |
| 336 | + defer dockerRun1Restart.Close() |
| 337 | + |
| 338 | + dockerRun2Restart := spawnMemberInDocker(t, basePort+(1*portIncrement), cID2, joins, certArg, certMount) |
| 339 | + defer dockerRun2Restart.Close() |
| 340 | + |
| 341 | + dockerRun3Restart := spawnMemberInDocker(t, basePort+(2*portIncrement), cID3, joins, certArg, certMount) |
| 342 | + defer dockerRun3Restart.Close() |
| 343 | + |
| 344 | + // Wait for cluster to restart (should be faster since data already exists) |
| 345 | + if ok := WaitUntilStarterReady(t, whatCluster, 3, dockerRun1Restart, dockerRun2Restart, dockerRun3Restart); ok { |
| 346 | + t.Logf("Cluster restarted with new certificate took %s", time.Since(restartStart)) |
| 347 | + |
| 348 | + // Give SSL endpoints time to initialize after restart |
| 349 | + t.Log("Waiting 30 seconds for SSL endpoints to be ready after restart...") |
| 350 | + time.Sleep(30 * time.Second) |
| 351 | + |
| 352 | + // Verify cluster functionality |
| 353 | + for i := 0; i < 3; i++ { |
| 354 | + testCluster(t, secureStarterEndpoint(i*portIncrement), true) |
| 355 | + } |
| 356 | + |
| 357 | + // Verify certificates were replaced |
| 358 | + t.Log("Verifying that certificates changed after restart") |
| 359 | + newCertSerials := getCertificateSerials(t, secureStarterEndpoint(0*portIncrement)) |
| 360 | + require.NotEmpty(t, newCertSerials, "Should have new certificate serials") |
| 361 | + |
| 362 | + // Verify all server types have new certificates |
| 363 | + for serverType, newSerial := range newCertSerials { |
| 364 | + oldSerial := initialCertSerials[serverType] |
| 365 | + if oldSerial == "" { |
| 366 | + t.Errorf("No initial certificate found for %s", serverType) |
| 367 | + continue |
| 368 | + } |
| 369 | + if newSerial == "" { |
| 370 | + t.Errorf("No new certificate found for %s", serverType) |
| 371 | + continue |
| 372 | + } |
| 373 | + if oldSerial == newSerial { |
| 374 | + t.Errorf("Certificate NOT replaced on %s: serial remained %s", serverType, oldSerial) |
| 375 | + } else { |
| 376 | + t.Logf("Certificate replaced on %s: %s -> %s", serverType, oldSerial, newSerial) |
| 377 | + } |
| 378 | + } |
| 379 | + |
| 380 | + require.NotEqual(t, initialCertSerials[client.ServerTypeCoordinator], |
| 381 | + newCertSerials[client.ServerTypeCoordinator], "Coordinator certificate should change") |
| 382 | + require.NotEqual(t, initialCertSerials[client.ServerTypeDBServer], |
| 383 | + newCertSerials[client.ServerTypeDBServer], "DBServer certificate should change") |
| 384 | + require.NotEqual(t, initialCertSerials[client.ServerTypeAgent], |
| 385 | + newCertSerials[client.ServerTypeAgent], "Agent certificate should change") |
| 386 | + |
| 387 | + t.Log("All certificates successfully rotated after graceful restart") |
| 388 | + } |
| 389 | + |
| 390 | + // Final graceful shutdown |
| 391 | + waitForCallFunction(t, |
| 392 | + ShutdownStarterCall(secureStarterEndpoint(0*portIncrement)), |
| 393 | + ShutdownStarterCall(secureStarterEndpoint(1*portIncrement)), |
| 394 | + ShutdownStarterCall(secureStarterEndpoint(2*portIncrement))) |
| 395 | +} |
0 commit comments