diff --git a/scripts/deploy/github/build-images.sh b/scripts/deploy/github/build-images.sh
index 1f4e7318e8e..a70d295c291 100755
--- a/scripts/deploy/github/build-images.sh
+++ b/scripts/deploy/github/build-images.sh
@@ -46,6 +46,19 @@ then
   exit $EXIT_CODE
 fi
 
+docker build -q -t "${REGISTRY}/driver:${TAG}" -f backend/Dockerfile.driver . && docker push "${REGISTRY}/driver:${TAG}" || EXIT_CODE=$?
+if [[ $EXIT_CODE -ne 0 ]]
+then
+  echo "Failed to build driver image."
+  exit $EXIT_CODE
+fi
+
+docker build -q -t "${REGISTRY}/launcher:${TAG}" -f backend/Dockerfile.launcher . && docker push "${REGISTRY}/launcher:${TAG}" || EXIT_CODE=$?
+if [[ $EXIT_CODE -ne 0 ]]
+then
+  echo "Failed to build launcher image."
+  exit $EXIT_CODE
+fi
 
 # clean up intermittent build caches to free up disk space
 docker system prune -a -f
diff --git a/scripts/deploy/github/deploy-kfp.sh b/scripts/deploy/github/deploy-kfp.sh
index 6acd46293ba..6206b92fe58 100755
--- a/scripts/deploy/github/deploy-kfp.sh
+++ b/scripts/deploy/github/deploy-kfp.sh
@@ -41,6 +41,18 @@ then
   exit 1
 fi
+echo "Patching deployments to use built docker images..."
+# Patch API server
+kubectl patch deployment ml-pipeline -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-api-server", "image": "kind-registry:5000/apiserver"}]}}}}' -n kubeflow
+# Patch persistence agent
+kubectl patch deployment.apps/ml-pipeline-persistenceagent -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-persistenceagent", "image": "kind-registry:5000/persistenceagent"}]}}}}' -n kubeflow
+# Patch scheduled workflow
+kubectl patch deployment.apps/ml-pipeline-scheduledworkflow -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-scheduledworkflow", "image": "kind-registry:5000/scheduledworkflow"}]}}}}' -n kubeflow
+
+# Update environment variables to override driver / launcher
+kubectl set env deployments/ml-pipeline V2_DRIVER_IMAGE=kind-registry:5000/driver -n kubeflow
+kubectl set env deployments/ml-pipeline V2_LAUNCHER_IMAGE=kind-registry:5000/launcher -n kubeflow
+
 # Check if all pods are running - (10 minutes)
 wait_for_pods || EXIT_CODE=$?
 
 if [[ $EXIT_CODE -ne 0 ]]
diff --git a/scripts/deploy/github/kfp-readiness/wait_for_pods.py b/scripts/deploy/github/kfp-readiness/wait_for_pods.py
index 3a61086afbb..ebc7546a300 100644
--- a/scripts/deploy/github/kfp-readiness/wait_for_pods.py
+++ b/scripts/deploy/github/kfp-readiness/wait_for_pods.py
@@ -21,15 +21,25 @@ def get_pod_statuses():
         pod_name = pod.metadata.name
         pod_status = pod.status.phase
         container_statuses = pod.status.container_statuses or []
-        ready_containers = sum(1 for status in container_statuses if status.ready)
-        total_containers = len(container_statuses)
-        statuses[pod_name] = (pod_status, ready_containers, total_containers)
+        ready = 0
+        total = 0
+        waiting_messages = []
+        for status in container_statuses:
+            total += 1
+            if status.ready:
+                ready += 1
+            if status.state.waiting is not None:
+                if status.state.waiting.message is not None:
+                    waiting_messages.append(f'Waiting on Container: {status.name} - {status.state.waiting.reason}: {status.state.waiting.message}')
+                else:
+                    waiting_messages.append(f'Waiting on Container: {status.name} - {status.state.waiting.reason}')
+        statuses[pod_name] = (pod_status, ready, total, waiting_messages)
     return statuses
 
 
 def all_pods_ready(statuses):
     return all(pod_status == 'Running' and ready == total
-               for pod_status, ready, total in statuses.values())
+               for pod_status, ready, total, _ in statuses.values())
 
 
 def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
@@ -41,8 +51,10 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
 
         current_statuses = get_pod_statuses()
         logging.info("Checking pod statuses...")
-        for pod_name, (pod_status, ready, total) in current_statuses.items():
+        for pod_name, (pod_status, ready, total, waiting_messages) in current_statuses.items():
             logging.info(f"Pod {pod_name} - Status: {pod_status}, Ready: {ready}/{total}")
+            for waiting_msg in waiting_messages:
+                logging.info(waiting_msg)
 
         if current_statuses == previous_statuses:
             if all_pods_ready(current_statuses):
@@ -65,7 +77,7 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
         raise Exception("Pods did not stabilize within the timeout period.")
 
     logging.info("Final pod statuses:")
-    for pod_name, (pod_status, ready, total) in previous_statuses.items():
+    for pod_name, (pod_status, ready, total, _) in previous_statuses.items():
         if pod_status == 'Running' and ready == total:
             logging.info(f"Pod {pod_name} is fully ready ({ready}/{total})")
         else: