Skip to content

Commit 7541eb5

Browse files
fix(bootstrap): surface Helm install failure on namespace timeout (#211)
Signed-off-by: Manoj-engineer <194872717+Manoj-engineer@users.noreply.github.com>
1 parent de9dcaa commit 7541eb5

File tree

1 file changed

+149
-0
lines changed
  • crates/openshell-bootstrap/src

1 file changed

+149
-0
lines changed

crates/openshell-bootstrap/src/lib.rs

Lines changed: 149 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -950,6 +950,115 @@ async fn probe_container_dns(docker: &Docker, container_name: &str) -> Result<bo
950950
Ok(exit_code == 0 && output.contains("DNS_OK"))
951951
}
952952

953+
/// Query the status and logs of the `helm-install-<chart>` Job(s) that k3s runs
954+
/// at startup to deploy the embedded Helm charts (e.g. the openshell chart).
955+
///
956+
/// When the Job has failed we return a formatted string containing the Job
957+
/// failure reason and the last 30 lines of its pod logs so that callers can
958+
/// surface this as the *real* cause of the "namespace not ready" timeout.
959+
///
960+
/// Returns `None` when:
961+
/// - the exec into the container itself fails (container not running), or
962+
/// - no failed Helm install Job is found.
963+
async fn diagnose_helm_failure(
964+
docker: &Docker,
965+
container_name: &str,
966+
kubeconfig: &str,
967+
) -> Option<String> {
968+
// Find Helm install Jobs with a numeric non-zero status.failed count.
969+
// Using `$2 ~ /^[1-9]/` avoids false positives from successful jobs where
970+
// status.failed is absent and kubectl prints "<none>" for that column.
971+
let (job_output, job_exit) = exec_capture_with_exit(
972+
docker,
973+
container_name,
974+
vec![
975+
"sh".to_string(),
976+
"-c".to_string(),
977+
format!(
978+
"KUBECONFIG={kubeconfig} kubectl get jobs -n kube-system \
979+
--no-headers -o custom-columns=NAME:.metadata.name,FAILED:.status.failed \
980+
2>/dev/null | awk '{{if ($2 ~ /^[1-9]/) print $1}}'"
981+
),
982+
],
983+
)
984+
.await
985+
.ok()?;
986+
987+
if job_exit != 0 || job_output.trim().is_empty() {
988+
return None;
989+
}
990+
991+
// Collect failed Helm install jobs (k3s names them `helm-install-<chart>`).
992+
let failed_jobs: Vec<&str> = job_output
993+
.lines()
994+
.map(str::trim)
995+
.filter(|l| !l.is_empty() && l.starts_with("helm-install-"))
996+
.collect();
997+
998+
if failed_jobs.is_empty() {
999+
return None;
1000+
}
1001+
1002+
let mut parts: Vec<String> = Vec::new();
1003+
1004+
for job in &failed_jobs {
1005+
// Get the Job's status conditions for a concise failure reason.
1006+
let cond_output = exec_capture_with_exit(
1007+
docker,
1008+
container_name,
1009+
vec![
1010+
"sh".to_string(),
1011+
"-c".to_string(),
1012+
format!(
1013+
"KUBECONFIG={kubeconfig} kubectl get job {job} -n kube-system \
1014+
-o jsonpath='{{range .status.conditions[*]}}{{.type}}: {{.message}}{{\"\\n\"}}{{end}}' \
1015+
2>/dev/null"
1016+
),
1017+
],
1018+
)
1019+
.await
1020+
.map(|(out, _)| out)
1021+
.unwrap_or_default();
1022+
1023+
// Get the last 30 lines of logs from the Job's pod(s).
1024+
let log_output = exec_capture_with_exit(
1025+
docker,
1026+
container_name,
1027+
vec![
1028+
"sh".to_string(),
1029+
"-c".to_string(),
1030+
format!(
1031+
"KUBECONFIG={kubeconfig} kubectl logs -n kube-system \
1032+
-l job-name={job} --tail=30 2>&1"
1033+
),
1034+
],
1035+
)
1036+
.await
1037+
.map(|(out, _)| out)
1038+
.unwrap_or_default();
1039+
1040+
let mut section = format!("Job {job} failed.");
1041+
let cond = cond_output.trim();
1042+
if !cond.is_empty() {
1043+
section.push_str(&format!("\n Status: {}", cond.replace('\n', "\n ")));
1044+
}
1045+
let logs = log_output.trim();
1046+
if !logs.is_empty() {
1047+
section.push_str("\n Last job logs:");
1048+
for line in logs.lines().take(30) {
1049+
section.push_str(&format!("\n {line}"));
1050+
}
1051+
}
1052+
parts.push(section);
1053+
}
1054+
1055+
if parts.is_empty() {
1056+
None
1057+
} else {
1058+
Some(parts.join("\n\n"))
1059+
}
1060+
}
1061+
9531062
async fn wait_for_namespace(
9541063
docker: &Docker,
9551064
container_name: &str,
@@ -1040,6 +1149,20 @@ async fn wait_for_namespace(
10401149
}
10411150

10421151
if attempt + 1 == attempts {
1152+
// Before returning a generic timeout error, check whether a Helm
1153+
// install job failed. If so, surface the real Helm error so the
1154+
// user doesn't have to dig through job logs manually.
1155+
let helm_hint = diagnose_helm_failure(docker, container_name, kubeconfig).await;
1156+
if let Some(hint) = helm_hint {
1157+
return Err(miette::miette!(
1158+
"timed out waiting for namespace '{namespace}' to exist.\n\n\
1159+
A Helm install job appears to have failed — this is likely the root cause:\n\n\
1160+
{hint}\n\n\
1161+
To inspect the full job logs run:\n \
1162+
kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
1163+
))
1164+
.wrap_err("K8s namespace not ready");
1165+
}
10431166
let logs = fetch_recent_logs(docker, container_name, 40).await;
10441167
return Err(miette::miette!(
10451168
"timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}"
@@ -1077,4 +1200,30 @@ mod tests {
10771200
);
10781201
}
10791202
}
1203+
1204+
/// Reconstruct the timeout error message that `wait_for_namespace` emits when
/// `diagnose_helm_failure` returns a hint, and check that the hint survives
/// verbatim inside it. `diagnose_failure` (in errors.rs) must not suppress or
/// override this text, but the Helm hint is spliced into the message as-is,
/// so verifying the string construction here is sufficient — no end-to-end
/// container exec is needed.
#[test]
fn helm_failure_hint_is_included_in_namespace_timeout_message() {
    // Stand-in for the non-None hint `diagnose_helm_failure` would produce.
    let hint = "Job helm-install-openshell failed.\n \
        Status: Failed: error validating \"\": apiVersion not set\n \
        Last job logs:\n Error: INSTALLATION FAILED: unable to build kubernetes \
        objects from release manifest: error validating data: apiVersion not set";

    // Same message shape `wait_for_namespace` builds on timeout with a hint.
    let message = format!(
        "timed out waiting for namespace 'openshell' to exist.\n\n\
        A Helm install job appears to have failed — this is likely the root cause:\n\n\
        {hint}\n\n\
        To inspect the full job logs run:\n \
        kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
    );

    // Every fragment a user needs to diagnose the failure must be present.
    let required_fragments = [
        "helm-install-openshell",
        "apiVersion not set",
        "INSTALLATION FAILED",
        "kubectl logs -n kube-system",
    ];
    for fragment in required_fragments {
        assert!(message.contains(fragment), "missing fragment: {fragment}");
    }
}
10801229
}

0 commit comments

Comments
 (0)