Skip to content

Commit e837849

Browse files
authored
feat(bootstrap): resume gateway from existing state and persist SSH handshake secret (#488)
1 parent 7eb1df6 commit e837849

File tree

19 files changed

+1030
-195
lines changed

19 files changed

+1030
-195
lines changed

.github/workflows/e2e-test.yml

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,25 @@ permissions:
1919

2020
jobs:
2121
e2e:
22-
name: E2E
22+
name: "E2E (${{ matrix.suite }})"
2323
runs-on: ${{ inputs.runner }}
2424
timeout-minutes: 30
25+
strategy:
26+
fail-fast: false
27+
matrix:
28+
include:
29+
- suite: python
30+
cluster: e2e-python
31+
port: "8080"
32+
cmd: "mise run --no-prepare --skip-deps e2e:python"
33+
- suite: rust
34+
cluster: e2e-rust
35+
port: "8081"
36+
cmd: "mise run --no-prepare --skip-deps e2e:rust"
37+
- suite: gateway-resume
38+
cluster: e2e-resume
39+
port: "8082"
40+
cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume"
2541
container:
2642
image: ghcr.io/nvidia/openshell/ci:latest
2743
credentials:
@@ -38,6 +54,7 @@ jobs:
3854
OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
3955
OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
4056
OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
57+
OPENSHELL_GATEWAY: ${{ matrix.cluster }}
4158
steps:
4259
- uses: actions/checkout@v4
4360

@@ -48,21 +65,26 @@ jobs:
4865
run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
4966

5067
- name: Install Python dependencies and generate protobuf stubs
68+
if: matrix.suite == 'python'
5169
run: uv sync --frozen && mise run --no-prepare python:proto
5270

53-
- name: Bootstrap and deploy cluster
71+
- name: Build Rust CLI
72+
if: matrix.suite != 'python'
73+
run: cargo build -p openshell-cli --features openshell-core/dev-settings
74+
75+
- name: Install SSH client
76+
if: matrix.suite != 'python'
77+
run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
78+
79+
- name: Bootstrap cluster
5480
env:
5581
GATEWAY_HOST: host.docker.internal
56-
GATEWAY_PORT: "8080"
82+
GATEWAY_PORT: ${{ matrix.port }}
83+
CLUSTER_NAME: ${{ matrix.cluster }}
5784
SKIP_IMAGE_PUSH: "1"
5885
SKIP_CLUSTER_IMAGE_BUILD: "1"
5986
OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
6087
run: mise run --no-prepare --skip-deps cluster
6188

62-
- name: Install SSH client for Rust CLI e2e tests
63-
run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
64-
65-
- name: Run E2E tests
66-
run: |
67-
mise run --no-prepare --skip-deps e2e:python
68-
mise run --no-prepare --skip-deps e2e:rust
89+
- name: Run tests
90+
run: ${{ matrix.cmd }}

crates/openshell-bootstrap/src/constants.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls";
1111
pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca";
1212
/// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes).
1313
pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls";
14+
/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods).
15+
pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake";
1416

1517
pub fn container_name(name: &str) -> String {
1618
format!("openshell-cluster-{name}")

crates/openshell-bootstrap/src/docker.rs

Lines changed: 150 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ use bollard::API_DEFAULT_VERSION;
88
use bollard::Docker;
99
use bollard::errors::Error as BollardError;
1010
use bollard::models::{
11-
ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum,
12-
NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, VolumeCreateRequest,
11+
ContainerCreateBody, DeviceRequest, EndpointSettings, HostConfig, HostConfigCgroupnsModeEnum,
12+
NetworkConnectRequest, NetworkCreateRequest, NetworkDisconnectRequest, PortBinding,
13+
RestartPolicy, RestartPolicyNameEnum, VolumeCreateRequest,
1314
};
1415
use bollard::query_parameters::{
1516
CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions,
@@ -466,6 +467,9 @@ pub async fn ensure_image(
466467
Ok(())
467468
}
468469

470+
/// Returns the actual host port the container is using. When an existing
471+
/// container is reused (same image), this may differ from `gateway_port`
472+
/// because the container was originally created with a different port.
469473
pub async fn ensure_container(
470474
docker: &Docker,
471475
name: &str,
@@ -478,7 +482,7 @@ pub async fn ensure_container(
478482
registry_username: Option<&str>,
479483
registry_token: Option<&str>,
480484
device_ids: &[String],
481-
) -> Result<()> {
485+
) -> Result<u16> {
482486
let container_name = container_name(name);
483487

484488
// Check if the container already exists
@@ -505,10 +509,37 @@ pub async fn ensure_container(
505509
};
506510

507511
if image_matches {
508-
return Ok(());
512+
// The container exists with the correct image, but its network
513+
// attachment may be stale. When the gateway is resumed after a
514+
// container kill, `ensure_network` destroys and recreates the
515+
// Docker network (giving it a new ID). The stopped container
516+
// still references the old network ID, so `docker start` would
517+
// fail with "network <old-id> not found".
518+
//
519+
// Fix: disconnect from any existing networks and reconnect to
520+
// the current (just-created) network before returning.
521+
let expected_net = network_name(name);
522+
reconcile_container_network(docker, &container_name, &expected_net).await?;
523+
524+
// Read the actual host port from the container's port bindings
525+
// as a cross-check. The caller should already pass the correct
526+
// port (from stored metadata), but this catches mismatches if
527+
// the container was recreated with a different port externally.
528+
let actual_port = info
529+
.host_config
530+
.as_ref()
531+
.and_then(|hc| hc.port_bindings.as_ref())
532+
.and_then(|pb| pb.get("30051/tcp"))
533+
.and_then(|bindings| bindings.as_ref())
534+
.and_then(|bindings| bindings.first())
535+
.and_then(|b| b.host_port.as_ref())
536+
.and_then(|p| p.parse::<u16>().ok())
537+
.unwrap_or(gateway_port);
538+
539+
return Ok(actual_port);
509540
}
510541

511-
// Image changed — remove the stale container so we can recreate it
542+
// Image changed — remove the stale container so we can recreate it.
512543
tracing::info!(
513544
"Container {} exists but uses a different image (container={}, desired={}), recreating",
514545
container_name,
@@ -555,6 +586,12 @@ pub async fn ensure_container(
555586
port_bindings: Some(port_bindings),
556587
binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]),
557588
network_mode: Some(network_name(name)),
589+
// Automatically restart the container when Docker restarts, unless the
590+
// user explicitly stopped it with `gateway stop`.
591+
restart_policy: Some(RestartPolicy {
592+
name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
593+
maximum_retry_count: None,
594+
}),
558595
// Add host gateway aliases for DNS resolution.
559596
// This allows both the entrypoint script and the running gateway
560597
// process to reach services on the Docker host.
@@ -734,7 +771,7 @@ pub async fn ensure_container(
734771
.await
735772
.into_diagnostic()
736773
.wrap_err("failed to create gateway container")?;
737-
Ok(())
774+
Ok(gateway_port)
738775
}
739776

740777
/// Information about a container that is holding a port we need.
@@ -956,6 +993,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
956993
Ok(())
957994
}
958995

996+
/// Clean up the gateway container and network, preserving the persistent volume.
997+
///
998+
/// Used when a resume attempt fails — we want to remove the container we may
999+
/// have just created but keep the volume so the user can retry without losing
1000+
/// their k3s/etcd state and sandbox data.
1001+
pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> {
1002+
let container_name = container_name(name);
1003+
let net_name = network_name(name);
1004+
1005+
// Disconnect container from network
1006+
let _ = docker
1007+
.disconnect_network(
1008+
&net_name,
1009+
NetworkDisconnectRequest {
1010+
container: container_name.clone(),
1011+
force: Some(true),
1012+
},
1013+
)
1014+
.await;
1015+
1016+
let _ = stop_container(docker, &container_name).await;
1017+
1018+
let remove_container = docker
1019+
.remove_container(
1020+
&container_name,
1021+
Some(RemoveContainerOptions {
1022+
force: true,
1023+
..Default::default()
1024+
}),
1025+
)
1026+
.await;
1027+
if let Err(err) = remove_container
1028+
&& !is_not_found(&err)
1029+
{
1030+
return Err(err).into_diagnostic();
1031+
}
1032+
1033+
force_remove_network(docker, &net_name).await?;
1034+
1035+
Ok(())
1036+
}
1037+
9591038
/// Forcefully remove a Docker network, disconnecting any remaining
9601039
/// containers first. This ensures that stale Docker network endpoints
9611040
/// cannot prevent port bindings from being released.
@@ -993,6 +1072,71 @@ async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> {
9931072
}
9941073
}
9951074

1075+
/// Ensure a stopped container is connected to the expected Docker network.
1076+
///
1077+
/// When a gateway is resumed after the container was killed (but not removed),
1078+
/// `ensure_network` destroys and recreates the network with a new ID. The
1079+
/// stopped container still holds a reference to the old network ID in its
1080+
/// config, so `docker start` would fail with a 404 "network not found" error.
1081+
///
1082+
/// This function disconnects the container from any networks that no longer
1083+
/// match the expected network name and connects it to the correct one.
1084+
async fn reconcile_container_network(
1085+
docker: &Docker,
1086+
container_name: &str,
1087+
expected_network: &str,
1088+
) -> Result<()> {
1089+
let info = docker
1090+
.inspect_container(container_name, None::<InspectContainerOptions>)
1091+
.await
1092+
.into_diagnostic()
1093+
.wrap_err("failed to inspect container for network reconciliation")?;
1094+
1095+
// Check the container's current network attachments via NetworkSettings.
1096+
let attached_networks: Vec<String> = info
1097+
.network_settings
1098+
.as_ref()
1099+
.and_then(|ns| ns.networks.as_ref())
1100+
.map(|nets| nets.keys().cloned().collect())
1101+
.unwrap_or_default();
1102+
1103+
// If the container is already attached to the expected network (by name),
1104+
// Docker will resolve the name to the current network ID on start.
1105+
// However, when the network was destroyed and recreated, the container's
1106+
// stored endpoint references the old ID. Disconnect and reconnect to
1107+
// pick up the new network ID.
1108+
for net_name in &attached_networks {
1109+
let _ = docker
1110+
.disconnect_network(
1111+
net_name,
1112+
NetworkDisconnectRequest {
1113+
container: container_name.to_string(),
1114+
force: Some(true),
1115+
},
1116+
)
1117+
.await;
1118+
}
1119+
1120+
// Connect to the (freshly created) expected network.
1121+
docker
1122+
.connect_network(
1123+
expected_network,
1124+
NetworkConnectRequest {
1125+
container: container_name.to_string(),
1126+
endpoint_config: Some(EndpointSettings::default()),
1127+
},
1128+
)
1129+
.await
1130+
.into_diagnostic()
1131+
.wrap_err("failed to connect container to gateway network")?;
1132+
1133+
tracing::debug!(
1134+
"Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}"
1135+
);
1136+
1137+
Ok(())
1138+
}
1139+
9961140
fn is_not_found(err: &BollardError) -> bool {
9971141
matches!(
9981142
err,

0 commit comments

Comments
 (0)