Skip to content

Commit

Permalink
feat(workflows): use built images in Github workflows (#11284)
Browse files Browse the repository at this point in the history
* Patch deployments and include debugging info

Signed-off-by: carter.fendley <[email protected]>

* Build and use driver / launcher too!

Signed-off-by: carter.fendley <[email protected]>

* Modify waiting status message

Signed-off-by: carter.fendley <[email protected]>

* Fix typo

Signed-off-by: carter.fendley <[email protected]>

---------

Signed-off-by: carter.fendley <[email protected]>
  • Loading branch information
CarterFendley authored Oct 11, 2024
1 parent 753a2f1 commit 1550b36
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 6 deletions.
13 changes: 13 additions & 0 deletions scripts/deploy/github/build-images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,19 @@ then
exit $EXIT_CODE
fi

# Build the driver image from backend/Dockerfile.driver and push it to the registry.
# The trailing `|| EXIT_CODE=$?` only runs when the build or the push fails,
# recording the failing command's status so the check below can abort the script.
docker build -q -t "${REGISTRY}/driver:${TAG}" -f backend/Dockerfile.driver . && docker push "${REGISTRY}/driver:${TAG}" || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
echo "Failed to build driver image."
exit $EXIT_CODE
fi

# Same build-and-push sequence for the launcher image (backend/Dockerfile.launcher).
docker build -q -t "${REGISTRY}/launcher:${TAG}" -f backend/Dockerfile.launcher . && docker push "${REGISTRY}/launcher:${TAG}" || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
echo "Failed to build launcher image."
exit $EXIT_CODE
fi

# clean up intermediate build caches to free up disk space
docker system prune -a -f
12 changes: 12 additions & 0 deletions scripts/deploy/github/deploy-kfp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ then
exit 1
fi

# Point the KFP deployments in the kubeflow namespace at the images served from
# kind-registry:5000. Each patch names the container whose image is replaced.
# NOTE(review): the image references below carry no tag, while build-images.sh
# pushes "${REGISTRY}/<name>:${TAG}" - presumably TAG resolves to "latest";
# confirm the two scripts agree.
echo "Patching deployments to use built docker images..."
# Patch API server
kubectl patch deployment ml-pipeline -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-api-server", "image": "kind-registry:5000/apiserver"}]}}}}' -n kubeflow
# Patch persistence agent
kubectl patch deployment.apps/ml-pipeline-persistenceagent -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-persistenceagent", "image": "kind-registry:5000/persistenceagent"}]}}}}' -n kubeflow
# Patch scheduled workflow
kubectl patch deployment.apps/ml-pipeline-scheduledworkflow -p '{"spec": {"template": {"spec": {"containers": [{"name": "ml-pipeline-scheduledworkflow", "image": "kind-registry:5000/scheduledworkflow"}]}}}}' -n kubeflow

# Update environment variables to override driver / launcher
# (set on the ml-pipeline deployment, i.e. the API server patched above)
kubectl set env deployments/ml-pipeline V2_DRIVER_IMAGE=kind-registry:5000/driver -n kubeflow
kubectl set env deployments/ml-pipeline V2_LAUNCHER_IMAGE=kind-registry:5000/launcher -n kubeflow

# Check if all pods are running - (10 minutes)
wait_for_pods || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
Expand Down
24 changes: 18 additions & 6 deletions scripts/deploy/github/kfp-readiness/wait_for_pods.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,25 @@ def get_pod_statuses():
pod_name = pod.metadata.name
pod_status = pod.status.phase
container_statuses = pod.status.container_statuses or []
ready_containers = sum(1 for status in container_statuses if status.ready)
total_containers = len(container_statuses)
statuses[pod_name] = (pod_status, ready_containers, total_containers)
ready = 0
total = 0
waiting_messages = []
for status in container_statuses:
total += 1
if status.ready:
ready += 1
if status.state.waiting is not None:
if status.state.waiting.message is not None:
waiting_messages.append(f'Waiting on Container: {status.name} - {status.state.waiting.reason}: {status.state.waiting.message}')
else:
waiting_messages.append(f'Waiting on Container: {status.name} - {status.state.waiting.reason}')
statuses[pod_name] = (pod_status, ready, total, waiting_messages)
return statuses


def all_pods_ready(statuses):
return all(pod_status == 'Running' and ready == total
for pod_status, ready, total in statuses.values())
for pod_status, ready, total, _ in statuses.values())


def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
Expand All @@ -41,8 +51,10 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
current_statuses = get_pod_statuses()

logging.info("Checking pod statuses...")
for pod_name, (pod_status, ready, total) in current_statuses.items():
for pod_name, (pod_status, ready, total, waiting_messages) in current_statuses.items():
logging.info(f"Pod {pod_name} - Status: {pod_status}, Ready: {ready}/{total}")
for waiting_msg in waiting_messages:
logging.info(waiting_msg)

if current_statuses == previous_statuses:
if all_pods_ready(current_statuses):
Expand All @@ -65,7 +77,7 @@ def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
raise Exception("Pods did not stabilize within the timeout period.")

logging.info("Final pod statuses:")
for pod_name, (pod_status, ready, total) in previous_statuses.items():
for pod_name, (pod_status, ready, total, _) in previous_statuses.items():
if pod_status == 'Running' and ready == total:
logging.info(f"Pod {pod_name} is fully ready ({ready}/{total})")
else:
Expand Down

0 comments on commit 1550b36

Please sign in to comment.