@@ -8,8 +8,9 @@ use bollard::API_DEFAULT_VERSION;
88use bollard:: Docker ;
99use bollard:: errors:: Error as BollardError ;
1010use bollard:: models:: {
11- ContainerCreateBody , DeviceRequest , HostConfig , HostConfigCgroupnsModeEnum ,
12- NetworkCreateRequest , NetworkDisconnectRequest , PortBinding , VolumeCreateRequest ,
11+ ContainerCreateBody , DeviceRequest , EndpointSettings , HostConfig , HostConfigCgroupnsModeEnum ,
12+ NetworkConnectRequest , NetworkCreateRequest , NetworkDisconnectRequest , PortBinding ,
13+ RestartPolicy , RestartPolicyNameEnum , VolumeCreateRequest ,
1314} ;
1415use bollard:: query_parameters:: {
1516 CreateContainerOptions , CreateImageOptions , InspectContainerOptions , InspectNetworkOptions ,
@@ -466,6 +467,9 @@ pub async fn ensure_image(
466467 Ok ( ( ) )
467468}
468469
470+ /// Returns the actual host port the container is using. When an existing
471+ /// container is reused (same image), this may differ from `gateway_port`
472+ /// because the container was originally created with a different port.
469473pub async fn ensure_container (
470474 docker : & Docker ,
471475 name : & str ,
@@ -478,7 +482,7 @@ pub async fn ensure_container(
478482 registry_username : Option < & str > ,
479483 registry_token : Option < & str > ,
480484 device_ids : & [ String ] ,
481- ) -> Result < ( ) > {
485+ ) -> Result < u16 > {
482486 let container_name = container_name ( name) ;
483487
484488 // Check if the container already exists
@@ -505,10 +509,37 @@ pub async fn ensure_container(
505509 } ;
506510
507511 if image_matches {
508- return Ok ( ( ) ) ;
512+ // The container exists with the correct image, but its network
513+ // attachment may be stale. When the gateway is resumed after a
514+ // container kill, `ensure_network` destroys and recreates the
515+ // Docker network (giving it a new ID). The stopped container
516+ // still references the old network ID, so `docker start` would
517+ // fail with "network <old-id> not found".
518+ //
519+ // Fix: disconnect from any existing networks and reconnect to
520+ // the current (just-created) network before returning.
521+ let expected_net = network_name ( name) ;
522+ reconcile_container_network ( docker, & container_name, & expected_net) . await ?;
523+
524+ // Read the actual host port from the container's port bindings
525+ // as a cross-check. The caller should already pass the correct
526+ // port (from stored metadata), but this catches mismatches if
527+ // the container was recreated with a different port externally.
528+ let actual_port = info
529+ . host_config
530+ . as_ref ( )
531+ . and_then ( |hc| hc. port_bindings . as_ref ( ) )
532+ . and_then ( |pb| pb. get ( "30051/tcp" ) )
533+ . and_then ( |bindings| bindings. as_ref ( ) )
534+ . and_then ( |bindings| bindings. first ( ) )
535+ . and_then ( |b| b. host_port . as_ref ( ) )
536+ . and_then ( |p| p. parse :: < u16 > ( ) . ok ( ) )
537+ . unwrap_or ( gateway_port) ;
538+
539+ return Ok ( actual_port) ;
509540 }
510541
511- // Image changed — remove the stale container so we can recreate it
542+ // Image changed — remove the stale container so we can recreate it.
512543 tracing:: info!(
513544 "Container {} exists but uses a different image (container={}, desired={}), recreating" ,
514545 container_name,
@@ -555,6 +586,12 @@ pub async fn ensure_container(
555586 port_bindings : Some ( port_bindings) ,
556587 binds : Some ( vec ! [ format!( "{}:/var/lib/rancher/k3s" , volume_name( name) ) ] ) ,
557588 network_mode : Some ( network_name ( name) ) ,
589+ // Automatically restart the container when Docker restarts, unless the
590+ // user explicitly stopped it with `gateway stop`.
591+ restart_policy : Some ( RestartPolicy {
592+ name : Some ( RestartPolicyNameEnum :: UNLESS_STOPPED ) ,
593+ maximum_retry_count : None ,
594+ } ) ,
558595 // Add host gateway aliases for DNS resolution.
559596 // This allows both the entrypoint script and the running gateway
560597 // process to reach services on the Docker host.
@@ -734,7 +771,7 @@ pub async fn ensure_container(
734771 . await
735772 . into_diagnostic ( )
736773 . wrap_err ( "failed to create gateway container" ) ?;
737- Ok ( ( ) )
774+ Ok ( gateway_port )
738775}
739776
740777/// Information about a container that is holding a port we need.
@@ -956,6 +993,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
956993 Ok ( ( ) )
957994}
958995
996+ /// Clean up the gateway container and network, preserving the persistent volume.
997+ ///
998+ /// Used when a resume attempt fails — we want to remove the container we may
999+ /// have just created but keep the volume so the user can retry without losing
1000+ /// their k3s/etcd state and sandbox data.
1001+ pub async fn cleanup_gateway_container ( docker : & Docker , name : & str ) -> Result < ( ) > {
1002+ let container_name = container_name ( name) ;
1003+ let net_name = network_name ( name) ;
1004+
1005+ // Disconnect container from network
1006+ let _ = docker
1007+ . disconnect_network (
1008+ & net_name,
1009+ NetworkDisconnectRequest {
1010+ container : container_name. clone ( ) ,
1011+ force : Some ( true ) ,
1012+ } ,
1013+ )
1014+ . await ;
1015+
1016+ let _ = stop_container ( docker, & container_name) . await ;
1017+
1018+ let remove_container = docker
1019+ . remove_container (
1020+ & container_name,
1021+ Some ( RemoveContainerOptions {
1022+ force : true ,
1023+ ..Default :: default ( )
1024+ } ) ,
1025+ )
1026+ . await ;
1027+ if let Err ( err) = remove_container
1028+ && !is_not_found ( & err)
1029+ {
1030+ return Err ( err) . into_diagnostic ( ) ;
1031+ }
1032+
1033+ force_remove_network ( docker, & net_name) . await ?;
1034+
1035+ Ok ( ( ) )
1036+ }
1037+
9591038/// Forcefully remove a Docker network, disconnecting any remaining
9601039/// containers first. This ensures that stale Docker network endpoints
9611040/// cannot prevent port bindings from being released.
@@ -993,6 +1072,71 @@ async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> {
9931072 }
9941073}
9951074
1075+ /// Ensure a stopped container is connected to the expected Docker network.
1076+ ///
1077+ /// When a gateway is resumed after the container was killed (but not removed),
1078+ /// `ensure_network` destroys and recreates the network with a new ID. The
1079+ /// stopped container still holds a reference to the old network ID in its
1080+ /// config, so `docker start` would fail with a 404 "network not found" error.
1081+ ///
1082+ /// This function disconnects the container from any networks that no longer
1083+ /// match the expected network name and connects it to the correct one.
1084+ async fn reconcile_container_network (
1085+ docker : & Docker ,
1086+ container_name : & str ,
1087+ expected_network : & str ,
1088+ ) -> Result < ( ) > {
1089+ let info = docker
1090+ . inspect_container ( container_name, None :: < InspectContainerOptions > )
1091+ . await
1092+ . into_diagnostic ( )
1093+ . wrap_err ( "failed to inspect container for network reconciliation" ) ?;
1094+
1095+ // Check the container's current network attachments via NetworkSettings.
1096+ let attached_networks: Vec < String > = info
1097+ . network_settings
1098+ . as_ref ( )
1099+ . and_then ( |ns| ns. networks . as_ref ( ) )
1100+ . map ( |nets| nets. keys ( ) . cloned ( ) . collect ( ) )
1101+ . unwrap_or_default ( ) ;
1102+
1103+ // If the container is already attached to the expected network (by name),
1104+ // Docker will resolve the name to the current network ID on start.
1105+ // However, when the network was destroyed and recreated, the container's
1106+ // stored endpoint references the old ID. Disconnect and reconnect to
1107+ // pick up the new network ID.
1108+ for net_name in & attached_networks {
1109+ let _ = docker
1110+ . disconnect_network (
1111+ net_name,
1112+ NetworkDisconnectRequest {
1113+ container : container_name. to_string ( ) ,
1114+ force : Some ( true ) ,
1115+ } ,
1116+ )
1117+ . await ;
1118+ }
1119+
1120+ // Connect to the (freshly created) expected network.
1121+ docker
1122+ . connect_network (
1123+ expected_network,
1124+ NetworkConnectRequest {
1125+ container : container_name. to_string ( ) ,
1126+ endpoint_config : Some ( EndpointSettings :: default ( ) ) ,
1127+ } ,
1128+ )
1129+ . await
1130+ . into_diagnostic ( )
1131+ . wrap_err ( "failed to connect container to gateway network" ) ?;
1132+
1133+ tracing:: debug!(
1134+ "Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}"
1135+ ) ;
1136+
1137+ Ok ( ( ) )
1138+ }
1139+
9961140fn is_not_found ( err : & BollardError ) -> bool {
9971141 matches ! (
9981142 err,
0 commit comments