Skip to content

Commit 9b0af9c

Browse files
committed
feat(bootstrap): resume gateway from existing state and persist SSH handshake secret
Add a resume code path to `gateway start` so that existing Docker volume state (k3s, etcd, sandboxes, secrets) is reused instead of requiring a full destroy/recreate cycle. When the container is gone but the volume remains (e.g. after a Docker restart), the CLI automatically creates a new container with the existing volume and reconciles PKI and secrets.

Move the SSH handshake HMAC secret from ephemeral generation in the cluster entrypoint (where it was regenerated on every container start) to a Kubernetes Secret that persists in etcd on the Docker volume. This ensures sandbox SSH sessions survive container restarts.

Key changes:
- Add `DeployOptions.resume` flag with a resume branch in the deploy flow
- Add `cleanup_gateway_container` for volume-preserving failure cleanup
- Auto-resume in `gateway_admin_deploy` (stopped/volume-only states)
- Auto-bootstrap tries resume first and falls back to recreate
- Add the `unless-stopped` Docker restart policy to the gateway container
- Reconcile the SSH handshake secret as a K8s Secret alongside TLS PKI
- Update the Helm chart to read the secret via `secretKeyRef`
- Add the SSH handshake secret to the cluster health check

Closes #487
1 parent de9dcaa commit 9b0af9c

File tree

11 files changed

+279
-111
lines changed

11 files changed

+279
-111
lines changed

crates/openshell-bootstrap/src/constants.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls";
1111
pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca";
1212
/// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes).
1313
pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls";
14+
/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods).
15+
pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake";
1416

1517
pub fn container_name(name: &str) -> String {
1618
format!("openshell-cluster-{name}")

crates/openshell-bootstrap/src/docker.rs

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ use bollard::Docker;
99
use bollard::errors::Error as BollardError;
1010
use bollard::models::{
1111
ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum,
12-
NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, VolumeCreateRequest,
12+
NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, RestartPolicy,
13+
RestartPolicyNameEnum, VolumeCreateRequest,
1314
};
1415
use bollard::query_parameters::{
1516
CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions,
@@ -532,6 +533,12 @@ pub async fn ensure_container(
532533
port_bindings: Some(port_bindings),
533534
binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]),
534535
network_mode: Some(network_name(name)),
536+
// Automatically restart the container when Docker restarts, unless the
537+
// user explicitly stopped it with `gateway stop`.
538+
restart_policy: Some(RestartPolicy {
539+
name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
540+
maximum_retry_count: None,
541+
}),
535542
// Add host gateway aliases for DNS resolution.
536543
// This allows both the entrypoint script and the running gateway
537544
// process to reach services on the Docker host.
@@ -919,6 +926,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
919926
Ok(())
920927
}
921928

929+
/// Clean up the gateway container and network, preserving the persistent volume.
930+
///
931+
/// Used when a resume attempt fails — we want to remove the container we may
932+
/// have just created but keep the volume so the user can retry without losing
933+
/// their k3s/etcd state and sandbox data.
934+
pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> {
935+
let container_name = container_name(name);
936+
let net_name = network_name(name);
937+
938+
// Disconnect container from network
939+
let _ = docker
940+
.disconnect_network(
941+
&net_name,
942+
NetworkDisconnectRequest {
943+
container: container_name.clone(),
944+
force: Some(true),
945+
},
946+
)
947+
.await;
948+
949+
let _ = stop_container(docker, &container_name).await;
950+
951+
let remove_container = docker
952+
.remove_container(
953+
&container_name,
954+
Some(RemoveContainerOptions {
955+
force: true,
956+
..Default::default()
957+
}),
958+
)
959+
.await;
960+
if let Err(err) = remove_container
961+
&& !is_not_found(&err)
962+
{
963+
return Err(err).into_diagnostic();
964+
}
965+
966+
force_remove_network(docker, &net_name).await?;
967+
968+
Ok(())
969+
}
970+
922971
/// Forcefully remove a Docker network, disconnecting any remaining
923972
/// containers first. This ensures that stale Docker network endpoints
924973
/// cannot prevent port bindings from being released.

crates/openshell-bootstrap/src/lib.rs

Lines changed: 131 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,13 @@ use miette::{IntoDiagnostic, Result};
2626
use std::sync::{Arc, Mutex};
2727

2828
use crate::constants::{
29-
CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, network_name,
30-
volume_name,
29+
CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME,
30+
SSH_HANDSHAKE_SECRET_NAME, network_name, volume_name,
3131
};
3232
use crate::docker::{
33-
check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container,
34-
ensure_image, ensure_network, ensure_volume, start_container, stop_container,
33+
check_existing_gateway, check_port_conflicts, cleanup_gateway_container,
34+
destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume,
35+
start_container, stop_container,
3536
};
3637
use crate::metadata::{
3738
create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host,
@@ -119,6 +120,11 @@ pub struct DeployOptions {
119120
/// When false, an existing gateway is left as-is and deployment is
120121
/// skipped (the caller is responsible for prompting the user first).
121122
pub recreate: bool,
123+
/// When true, resume from existing state (volume, stopped container)
124+
/// instead of erroring out. The deploy flow reuses the existing volume
125+
/// and creates a new container if needed, preserving k3s/etcd state,
126+
/// sandbox pods, and secrets.
127+
pub resume: bool,
122128
}
123129

124130
impl DeployOptions {
@@ -135,6 +141,7 @@ impl DeployOptions {
135141
registry_token: None,
136142
gpu: false,
137143
recreate: false,
144+
resume: false,
138145
}
139146
}
140147

@@ -200,6 +207,13 @@ impl DeployOptions {
200207
self.recreate = recreate;
201208
self
202209
}
210+
211+
/// Set whether to resume from existing state (volume, stopped container).
212+
#[must_use]
213+
pub fn with_resume(mut self, resume: bool) -> Self {
214+
self.resume = resume;
215+
self
216+
}
203217
}
204218

205219
#[derive(Debug, Clone)]
@@ -264,6 +278,7 @@ where
264278
let registry_token = options.registry_token;
265279
let gpu = options.gpu;
266280
let recreate = options.recreate;
281+
let resume = options.resume;
267282

268283
// Wrap on_log in Arc<Mutex<>> so we can share it with pull_remote_image
269284
// which needs a 'static callback for the bollard streaming pull.
@@ -288,12 +303,28 @@ where
288303
(preflight.docker, None)
289304
};
290305

291-
// If an existing gateway is found, either tear it down (when recreate is
292-
// requested) or bail out so the caller can prompt the user / reuse it.
306+
// Guard: recreate takes precedence if both are set (shouldn't happen in practice).
307+
if resume && recreate {
308+
tracing::warn!("both resume and recreate set; recreate takes precedence");
309+
}
310+
311+
// If an existing gateway is found, decide how to proceed:
312+
// - recreate: destroy everything and start fresh
313+
// - resume: keep existing state and create/start the container
314+
// - neither: error out (caller should prompt the user)
293315
if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
294316
if recreate {
295317
log("[status] Removing existing gateway".to_string());
296318
destroy_gateway_resources(&target_docker, &name).await?;
319+
} else if resume {
320+
if existing.container_running {
321+
log("[status] Gateway is already running".to_string());
322+
} else {
323+
log("[status] Resuming gateway from existing state".to_string());
324+
}
325+
// Fall through to ensure_* calls — they are idempotent and will
326+
// reuse the existing volume, create a container if needed, and
327+
// start it.
297328
} else {
298329
return Err(miette::miette!(
299330
"Gateway '{name}' already exists (container_running={}).\n\
@@ -455,6 +486,11 @@ where
455486

456487
store_pki_bundle(&name, &pki_bundle)?;
457488

489+
// Reconcile SSH handshake secret: reuse existing K8s secret if present,
490+
// generate and persist a new one otherwise. This secret is stored in etcd
491+
// (on the persistent volume) so it survives container restarts.
492+
reconcile_ssh_handshake_secret(&target_docker, &name, &log).await?;
493+
458494
// Push locally-built component images into the k3s containerd runtime.
459495
// This is the "push" path for local development — images are exported from
460496
// the local Docker daemon and streamed into the cluster's containerd so
@@ -524,15 +560,30 @@ where
524560
docker: target_docker,
525561
}),
526562
Err(deploy_err) => {
527-
// Automatically clean up Docker resources (volume, container, network,
528-
// image) so the environment is left in a retryable state.
529-
tracing::info!("deploy failed, cleaning up gateway resources for '{name}'");
530-
if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await {
531-
tracing::warn!(
532-
"automatic cleanup after failed deploy also failed: {cleanup_err}. \
533-
Manual cleanup may be required: \
534-
openshell gateway destroy --name {name}"
563+
if resume {
564+
// When resuming, preserve the volume so the user can retry.
565+
// Only clean up the container and network that we may have created.
566+
tracing::info!(
567+
"resume failed, cleaning up container for '{name}' (preserving volume)"
535568
);
569+
if let Err(cleanup_err) = cleanup_gateway_container(&target_docker, &name).await {
570+
tracing::warn!(
571+
"automatic cleanup after failed resume also failed: {cleanup_err}. \
572+
Manual cleanup may be required: \
573+
openshell gateway destroy --name {name}"
574+
);
575+
}
576+
} else {
577+
// Automatically clean up Docker resources (volume, container, network,
578+
// image) so the environment is left in a retryable state.
579+
tracing::info!("deploy failed, cleaning up gateway resources for '{name}'");
580+
if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await {
581+
tracing::warn!(
582+
"automatic cleanup after failed deploy also failed: {cleanup_err}. \
583+
Manual cleanup may be required: \
584+
openshell gateway destroy --name {name}"
585+
);
586+
}
536587
}
537588
Err(deploy_err)
538589
}
@@ -837,6 +888,72 @@ where
837888
Ok((bundle, true))
838889
}
839890

891+
/// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret.
892+
///
893+
/// If the secret already exists in the cluster, this is a no-op. Otherwise a
894+
/// fresh 32-byte hex secret is generated and applied. Because the secret lives
895+
/// in etcd (backed by the persistent Docker volume), it survives container
896+
/// restarts without regeneration — existing sandbox SSH sessions remain valid.
897+
async fn reconcile_ssh_handshake_secret<F>(docker: &Docker, name: &str, log: &F) -> Result<()>
898+
where
899+
F: Fn(String) + Sync,
900+
{
901+
use miette::WrapErr;
902+
903+
let cname = container_name(name);
904+
let kubeconfig = constants::KUBECONFIG_PATH;
905+
906+
// Check if the secret already exists.
907+
let (output, exit_code) = exec_capture_with_exit(
908+
docker,
909+
&cname,
910+
vec![
911+
"sh".to_string(),
912+
"-c".to_string(),
913+
format!(
914+
"KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null"
915+
),
916+
],
917+
)
918+
.await?;
919+
920+
if exit_code == 0 && !output.trim().is_empty() {
921+
tracing::debug!(
922+
"existing SSH handshake secret found ({} bytes encoded)",
923+
output.trim().len()
924+
);
925+
log("[progress] Reusing existing SSH handshake secret".to_string());
926+
return Ok(());
927+
}
928+
929+
// Generate a new 32-byte hex secret and create the K8s secret.
930+
log("[progress] Generating SSH handshake secret".to_string());
931+
let (output, exit_code) = exec_capture_with_exit(
932+
docker,
933+
&cname,
934+
vec![
935+
"sh".to_string(),
936+
"-c".to_string(),
937+
format!(
938+
"SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\n') && \
939+
KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \
940+
--from-literal=secret=$SECRET --dry-run=client -o yaml | \
941+
KUBECONFIG={kubeconfig} kubectl apply -f -"
942+
),
943+
],
944+
)
945+
.await?;
946+
947+
if exit_code != 0 {
948+
return Err(miette::miette!(
949+
"failed to create SSH handshake secret (exit {exit_code}): {output}"
950+
))
951+
.wrap_err("failed to apply SSH handshake secret");
952+
}
953+
954+
Ok(())
955+
}
956+
840957
/// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`].
841958
///
842959
/// Returns an error string describing why secrets couldn't be loaded (for logging).

crates/openshell-cli/src/bootstrap.rs

Lines changed: 54 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -144,43 +144,62 @@ pub async fn run_bootstrap(
144144
);
145145
eprintln!();
146146

147-
// Auto-bootstrap always recreates if stale Docker resources are found
148-
// (e.g. metadata was deleted but container/volume still exist).
149-
let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true);
150-
if let Some(dest) = remote {
151-
let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest);
152-
if let Some(key) = ssh_key {
153-
remote_opts = remote_opts.with_ssh_key(key);
147+
// Build base deploy options. The auto-bootstrap path tries to resume from
148+
// existing state first (preserving sandboxes and secrets), falling back to
149+
// a full recreate if the resume fails.
150+
let build_options = |resume: bool, recreate: bool| {
151+
let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name)
152+
.with_resume(resume)
153+
.with_recreate(recreate)
154+
.with_gpu(gpu);
155+
if let Some(dest) = remote {
156+
let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest);
157+
if let Some(key) = ssh_key {
158+
remote_opts = remote_opts.with_ssh_key(key);
159+
}
160+
opts = opts.with_remote(remote_opts);
154161
}
155-
options = options.with_remote(remote_opts);
156-
}
157-
// Read registry credentials from environment for the auto-bootstrap path.
158-
// The explicit `--registry-username` / `--registry-token` flags are only
159-
// on `gateway start`; when bootstrapping via `sandbox create`, the env
160-
// vars are the mechanism.
161-
if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME")
162-
&& !username.trim().is_empty()
163-
{
164-
options = options.with_registry_username(username);
165-
}
166-
if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN")
167-
&& !token.trim().is_empty()
168-
{
169-
options = options.with_registry_token(token);
170-
}
171-
// Read gateway host override from environment. Needed whenever the
172-
// client cannot reach the Docker host at 127.0.0.1 — CI containers,
173-
// WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag
174-
// is only on `gateway start`; this env var covers the auto-bootstrap
175-
// path triggered by `sandbox create`.
176-
if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST")
177-
&& !host.trim().is_empty()
162+
// Read registry credentials from environment for the auto-bootstrap path.
163+
// The explicit `--registry-username` / `--registry-token` flags are only
164+
// on `gateway start`; when bootstrapping via `sandbox create`, the env
165+
// vars are the mechanism.
166+
if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME")
167+
&& !username.trim().is_empty()
168+
{
169+
opts = opts.with_registry_username(username);
170+
}
171+
if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN")
172+
&& !token.trim().is_empty()
173+
{
174+
opts = opts.with_registry_token(token);
175+
}
176+
// Read gateway host override from environment. Needed whenever the
177+
// client cannot reach the Docker host at 127.0.0.1 — CI containers,
178+
// WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag
179+
// is only on `gateway start`; this env var covers the auto-bootstrap
180+
// path triggered by `sandbox create`.
181+
if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST")
182+
&& !host.trim().is_empty()
183+
{
184+
opts = opts.with_gateway_host(host);
185+
}
186+
opts
187+
};
188+
189+
// Try resume first to preserve existing sandboxes and secrets.
190+
let handle = match deploy_gateway_with_panel(
191+
build_options(true, false),
192+
&gateway_name,
193+
location,
194+
)
195+
.await
178196
{
179-
options = options.with_gateway_host(host);
180-
}
181-
options = options.with_gpu(gpu);
182-
183-
let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?;
197+
Ok(handle) => handle,
198+
Err(resume_err) => {
199+
tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}");
200+
deploy_gateway_with_panel(build_options(false, true), &gateway_name, location).await?
201+
}
202+
};
184203
let server = handle.gateway_endpoint().to_string();
185204

186205
print_deploy_summary(&gateway_name, &handle);

0 commit comments

Comments
 (0)