@@ -26,12 +26,13 @@ use miette::{IntoDiagnostic, Result};
2626use std:: sync:: { Arc , Mutex } ;
2727
2828use crate :: constants:: {
29- CLIENT_TLS_SECRET_NAME , SERVER_CLIENT_CA_SECRET_NAME , SERVER_TLS_SECRET_NAME , network_name ,
30- volume_name,
29+ CLIENT_TLS_SECRET_NAME , SERVER_CLIENT_CA_SECRET_NAME , SERVER_TLS_SECRET_NAME ,
30+ SSH_HANDSHAKE_SECRET_NAME , network_name , volume_name,
3131} ;
3232use crate :: docker:: {
33- check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container,
34- ensure_image, ensure_network, ensure_volume, start_container, stop_container,
33+ check_existing_gateway, check_port_conflicts, cleanup_gateway_container,
34+ destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume,
35+ start_container, stop_container,
3536} ;
3637use crate :: metadata:: {
3738 create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host,
@@ -119,6 +120,11 @@ pub struct DeployOptions {
119120 /// When false, an existing gateway is left as-is and deployment is
120121 /// skipped (the caller is responsible for prompting the user first).
121122 pub recreate : bool ,
123+ /// When true, resume from existing state (volume, stopped container)
124+ /// instead of erroring out. The deploy flow reuses the existing volume
125+ /// and creates a new container if needed, preserving k3s/etcd state,
126+ /// sandbox pods, and secrets.
127+ pub resume : bool ,
122128}
123129
124130impl DeployOptions {
@@ -135,6 +141,7 @@ impl DeployOptions {
135141 registry_token : None ,
136142 gpu : false ,
137143 recreate : false ,
144+ resume : false ,
138145 }
139146 }
140147
@@ -200,6 +207,13 @@ impl DeployOptions {
200207 self . recreate = recreate;
201208 self
202209 }
210+
211+ /// Set whether to resume from existing state (volume, stopped container).
212+ #[ must_use]
213+ pub fn with_resume ( mut self , resume : bool ) -> Self {
214+ self . resume = resume;
215+ self
216+ }
203217}
204218
205219#[ derive( Debug , Clone ) ]
@@ -264,6 +278,7 @@ where
264278 let registry_token = options. registry_token ;
265279 let gpu = options. gpu ;
266280 let recreate = options. recreate ;
281+ let resume = options. resume ;
267282
268283 // Wrap on_log in Arc<Mutex<>> so we can share it with pull_remote_image
269284 // which needs a 'static callback for the bollard streaming pull.
@@ -288,12 +303,28 @@ where
288303 ( preflight. docker , None )
289304 } ;
290305
291- // If an existing gateway is found, either tear it down (when recreate is
292- // requested) or bail out so the caller can prompt the user / reuse it.
306+ // Guard: recreate takes precedence if both are set (shouldn't happen in practice).
307+ if resume && recreate {
308+ tracing:: warn!( "both resume and recreate set; recreate takes precedence" ) ;
309+ }
310+
311+ // If an existing gateway is found, decide how to proceed:
312+ // - recreate: destroy everything and start fresh
313+ // - resume: keep existing state and create/start the container
314+ // - neither: error out (caller should prompt the user)
293315 if let Some ( existing) = check_existing_gateway ( & target_docker, & name) . await ? {
294316 if recreate {
295317 log ( "[status] Removing existing gateway" . to_string ( ) ) ;
296318 destroy_gateway_resources ( & target_docker, & name) . await ?;
319+ } else if resume {
320+ if existing. container_running {
321+ log ( "[status] Gateway is already running" . to_string ( ) ) ;
322+ } else {
323+ log ( "[status] Resuming gateway from existing state" . to_string ( ) ) ;
324+ }
325+ // Fall through to ensure_* calls — they are idempotent and will
326+ // reuse the existing volume, create a container if needed, and
327+ // start it.
297328 } else {
298329 return Err ( miette:: miette!(
299330 "Gateway '{name}' already exists (container_running={}).\n \
@@ -455,6 +486,11 @@ where
455486
456487 store_pki_bundle ( & name, & pki_bundle) ?;
457488
489+ // Reconcile SSH handshake secret: reuse existing K8s secret if present,
490+ // generate and persist a new one otherwise. This secret is stored in etcd
491+ // (on the persistent volume) so it survives container restarts.
492+ reconcile_ssh_handshake_secret ( & target_docker, & name, & log) . await ?;
493+
458494 // Push locally-built component images into the k3s containerd runtime.
459495 // This is the "push" path for local development — images are exported from
460496 // the local Docker daemon and streamed into the cluster's containerd so
@@ -524,15 +560,30 @@ where
524560 docker : target_docker,
525561 } ) ,
526562 Err ( deploy_err) => {
527- // Automatically clean up Docker resources (volume, container, network,
528- // image) so the environment is left in a retryable state.
529- tracing:: info!( "deploy failed, cleaning up gateway resources for '{name}'" ) ;
530- if let Err ( cleanup_err) = destroy_gateway_resources ( & target_docker, & name) . await {
531- tracing:: warn!(
532- "automatic cleanup after failed deploy also failed: {cleanup_err}. \
533- Manual cleanup may be required: \
534- openshell gateway destroy --name {name}"
563+ if resume {
564+ // When resuming, preserve the volume so the user can retry.
565+ // Only clean up the container and network that we may have created.
566+ tracing:: info!(
567+ "resume failed, cleaning up container for '{name}' (preserving volume)"
535568 ) ;
569+ if let Err ( cleanup_err) = cleanup_gateway_container ( & target_docker, & name) . await {
570+ tracing:: warn!(
571+ "automatic cleanup after failed resume also failed: {cleanup_err}. \
572+ Manual cleanup may be required: \
573+ openshell gateway destroy --name {name}"
574+ ) ;
575+ }
576+ } else {
577+ // Automatically clean up Docker resources (volume, container, network,
578+ // image) so the environment is left in a retryable state.
579+ tracing:: info!( "deploy failed, cleaning up gateway resources for '{name}'" ) ;
580+ if let Err ( cleanup_err) = destroy_gateway_resources ( & target_docker, & name) . await {
581+ tracing:: warn!(
582+ "automatic cleanup after failed deploy also failed: {cleanup_err}. \
583+ Manual cleanup may be required: \
584+ openshell gateway destroy --name {name}"
585+ ) ;
586+ }
536587 }
537588 Err ( deploy_err)
538589 }
@@ -837,6 +888,72 @@ where
837888 Ok ( ( bundle, true ) )
838889}
839890
891+ /// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret.
892+ ///
893+ /// If the secret already exists in the cluster, this is a no-op. Otherwise a
894+ /// fresh 32-byte hex secret is generated and applied. Because the secret lives
895+ /// in etcd (backed by the persistent Docker volume), it survives container
896+ /// restarts without regeneration — existing sandbox SSH sessions remain valid.
897+ async fn reconcile_ssh_handshake_secret < F > ( docker : & Docker , name : & str , log : & F ) -> Result < ( ) >
898+ where
899+ F : Fn ( String ) + Sync ,
900+ {
901+ use miette:: WrapErr ;
902+
903+ let cname = container_name ( name) ;
904+ let kubeconfig = constants:: KUBECONFIG_PATH ;
905+
906+ // Check if the secret already exists.
907+ let ( output, exit_code) = exec_capture_with_exit (
908+ docker,
909+ & cname,
910+ vec ! [
911+ "sh" . to_string( ) ,
912+ "-c" . to_string( ) ,
913+ format!(
914+ "KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null"
915+ ) ,
916+ ] ,
917+ )
918+ . await ?;
919+
920+ if exit_code == 0 && !output. trim ( ) . is_empty ( ) {
921+ tracing:: debug!(
922+ "existing SSH handshake secret found ({} bytes encoded)" ,
923+ output. trim( ) . len( )
924+ ) ;
925+ log ( "[progress] Reusing existing SSH handshake secret" . to_string ( ) ) ;
926+ return Ok ( ( ) ) ;
927+ }
928+
929+ // Generate a new 32-byte hex secret and create the K8s secret.
930+ log ( "[progress] Generating SSH handshake secret" . to_string ( ) ) ;
931+ let ( output, exit_code) = exec_capture_with_exit (
932+ docker,
933+ & cname,
934+ vec ! [
935+ "sh" . to_string( ) ,
936+ "-c" . to_string( ) ,
937+ format!(
938+ "SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\ n') && \
939+ KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \
940+ --from-literal=secret=$SECRET --dry-run=client -o yaml | \
941+ KUBECONFIG={kubeconfig} kubectl apply -f -"
942+ ) ,
943+ ] ,
944+ )
945+ . await ?;
946+
947+ if exit_code != 0 {
948+ return Err ( miette:: miette!(
949+ "failed to create SSH handshake secret (exit {exit_code}): {output}"
950+ ) )
951+ . wrap_err ( "failed to apply SSH handshake secret" ) ;
952+ }
953+
954+ Ok ( ( ) )
955+ }
956+
840957/// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`].
841958///
842959/// Returns an error string describing why secrets couldn't be loaded (for logging).
0 commit comments