@@ -950,6 +950,115 @@ async fn probe_container_dns(docker: &Docker, container_name: &str) -> Result<bo
950950 Ok ( exit_code == 0 && output. contains ( "DNS_OK" ) )
951951}
952952
953+ /// Query the status and logs of the `helm-install-<chart>` Job(s) that k3s runs
954+ /// at startup to deploy the embedded Helm charts (e.g. the openshell chart).
955+ ///
956+ /// When the Job has failed we return a formatted string containing the Job
957+ /// failure reason and the last 30 lines of its pod logs so that callers can
958+ /// surface this as the *real* cause of the "namespace not ready" timeout.
959+ ///
960+ /// Returns `None` when:
961+ /// - the exec into the container itself fails (container not running), or
962+ /// - no failed Helm install Job is found.
963+ async fn diagnose_helm_failure (
964+ docker : & Docker ,
965+ container_name : & str ,
966+ kubeconfig : & str ,
967+ ) -> Option < String > {
968+ // Find Helm install Jobs with a numeric non-zero status.failed count.
969+ // Using `$2 ~ /^[1-9]/` avoids false positives from successful jobs where
970+ // status.failed is absent and kubectl prints "<none>" for that column.
971+ let ( job_output, job_exit) = exec_capture_with_exit (
972+ docker,
973+ container_name,
974+ vec ! [
975+ "sh" . to_string( ) ,
976+ "-c" . to_string( ) ,
977+ format!(
978+ "KUBECONFIG={kubeconfig} kubectl get jobs -n kube-system \
979+ --no-headers -o custom-columns=NAME:.metadata.name,FAILED:.status.failed \
980+ 2>/dev/null | awk '{{if ($2 ~ /^[1-9]/) print $1}}'"
981+ ) ,
982+ ] ,
983+ )
984+ . await
985+ . ok ( ) ?;
986+
987+ if job_exit != 0 || job_output. trim ( ) . is_empty ( ) {
988+ return None ;
989+ }
990+
991+ // Collect failed Helm install jobs (k3s names them `helm-install-<chart>`).
992+ let failed_jobs: Vec < & str > = job_output
993+ . lines ( )
994+ . map ( str:: trim)
995+ . filter ( |l| !l. is_empty ( ) && l. starts_with ( "helm-install-" ) )
996+ . collect ( ) ;
997+
998+ if failed_jobs. is_empty ( ) {
999+ return None ;
1000+ }
1001+
1002+ let mut parts: Vec < String > = Vec :: new ( ) ;
1003+
1004+ for job in & failed_jobs {
1005+ // Get the Job's status conditions for a concise failure reason.
1006+ let cond_output = exec_capture_with_exit (
1007+ docker,
1008+ container_name,
1009+ vec ! [
1010+ "sh" . to_string( ) ,
1011+ "-c" . to_string( ) ,
1012+ format!(
1013+ "KUBECONFIG={kubeconfig} kubectl get job {job} -n kube-system \
1014+ -o jsonpath='{{range .status.conditions[*]}}{{.type}}: {{.message}}{{\" \\ n\" }}{{end}}' \
1015+ 2>/dev/null"
1016+ ) ,
1017+ ] ,
1018+ )
1019+ . await
1020+ . map ( |( out, _) | out)
1021+ . unwrap_or_default ( ) ;
1022+
1023+ // Get the last 30 lines of logs from the Job's pod(s).
1024+ let log_output = exec_capture_with_exit (
1025+ docker,
1026+ container_name,
1027+ vec ! [
1028+ "sh" . to_string( ) ,
1029+ "-c" . to_string( ) ,
1030+ format!(
1031+ "KUBECONFIG={kubeconfig} kubectl logs -n kube-system \
1032+ -l job-name={job} --tail=30 2>&1"
1033+ ) ,
1034+ ] ,
1035+ )
1036+ . await
1037+ . map ( |( out, _) | out)
1038+ . unwrap_or_default ( ) ;
1039+
1040+ let mut section = format ! ( "Job {job} failed." ) ;
1041+ let cond = cond_output. trim ( ) ;
1042+ if !cond. is_empty ( ) {
1043+ section. push_str ( & format ! ( "\n Status: {}" , cond. replace( '\n' , "\n " ) ) ) ;
1044+ }
1045+ let logs = log_output. trim ( ) ;
1046+ if !logs. is_empty ( ) {
1047+ section. push_str ( "\n Last job logs:" ) ;
1048+ for line in logs. lines ( ) . take ( 30 ) {
1049+ section. push_str ( & format ! ( "\n {line}" ) ) ;
1050+ }
1051+ }
1052+ parts. push ( section) ;
1053+ }
1054+
1055+ if parts. is_empty ( ) {
1056+ None
1057+ } else {
1058+ Some ( parts. join ( "\n \n " ) )
1059+ }
1060+ }
1061+
9531062async fn wait_for_namespace (
9541063 docker : & Docker ,
9551064 container_name : & str ,
@@ -1040,6 +1149,20 @@ async fn wait_for_namespace(
10401149 }
10411150
10421151 if attempt + 1 == attempts {
1152+ // Before returning a generic timeout error, check whether a Helm
1153+ // install job failed. If so, surface the real Helm error so the
1154+ // user doesn't have to dig through job logs manually.
1155+ let helm_hint = diagnose_helm_failure ( docker, container_name, kubeconfig) . await ;
1156+ if let Some ( hint) = helm_hint {
1157+ return Err ( miette:: miette!(
1158+ "timed out waiting for namespace '{namespace}' to exist.\n \n \
1159+ A Helm install job appears to have failed — this is likely the root cause:\n \n \
1160+ {hint}\n \n \
1161+ To inspect the full job logs run:\n \
1162+ kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
1163+ ) )
1164+ . wrap_err ( "K8s namespace not ready" ) ;
1165+ }
10431166 let logs = fetch_recent_logs ( docker, container_name, 40 ) . await ;
10441167 return Err ( miette:: miette!(
10451168 "timed out waiting for namespace '{namespace}' to exist: {output}\n {logs}"
@@ -1077,4 +1200,30 @@ mod tests {
10771200 ) ;
10781201 }
10791202 }
1203+
/// Builds a namespace-timeout message around a sample Helm failure hint and
/// checks that the hint's key fragments, plus the follow-up `kubectl logs`
/// command, are present in the resulting text.
///
/// The message is assembled locally with `format!`; no container exec and no
/// other function of this crate is involved, so this only pins the shape of
/// the string construction.
#[test]
fn helm_failure_hint_is_included_in_namespace_timeout_message() {
    // Sample hint: one failed job, its status line and a single log line.
    let helm_hint = "Job helm-install-openshell failed.\n \
                     Status: Failed: error validating \" \" : apiVersion not set\n \
                     Last job logs:\n Error: INSTALLATION FAILED: unable to build kubernetes \
                     objects from release manifest: error validating data: apiVersion not set";

    // Timeout message with the hint embedded verbatim.
    let error_msg = format!(
        "timed out waiting for namespace 'openshell' to exist.\n \n \
         A Helm install job appears to have failed — this is likely the root cause:\n \n \
         {helm_hint}\n \n \
         To inspect the full job logs run:\n \
         kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
    );

    let expected_fragments = [
        "helm-install-openshell",
        "apiVersion not set",
        "INSTALLATION FAILED",
        "kubectl logs -n kube-system",
    ];
    for fragment in expected_fragments {
        assert!(error_msg.contains(fragment));
    }
}
10801229}
0 commit comments