diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml
index 8ae7f888ab..1ce5fb5f7b 100644
--- a/.github/workflows/flyte-binary-v2.yml
+++ b/.github/workflows/flyte-binary-v2.yml
@@ -185,6 +185,12 @@ jobs:
         run: |
           mkdir -p /tmp/cpu-oci
           tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci
+      - name: Upload CPU OCI archive for integration test
+        uses: actions/upload-artifact@v4
+        with:
+          name: devbox-cpu-oci
+          path: /tmp/cpu-oci.tar
+          retention-days: 1
       - name: Push CPU multi-arch image
         if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
@@ -225,3 +231,105 @@ jobs:
       push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
       cache-from: type=gha,scope=demo-gpu
       cache-to: type=gha,mode=max,scope=demo-gpu
+
+  devbox-integration-test:
+    runs-on: ubuntu-latest
+    needs: [build-and-push-devbox-bundled-image]
+    timeout-minutes: 25
+    env:
+      DEVBOX_IMAGE: flyte-devbox:ci
+      FLYTE_WORKER_IMAGE: ghcr.io/flyteorg/flyte:py3.11-v2.0.0b55
+      FLYTE_SDK_VERSION: "2.0.0b55"
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/download-artifact@v4
+        with:
+          name: devbox-cpu-oci
+          path: /tmp
+      - name: Load devbox image into Docker
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y skopeo
+          mkdir -p /tmp/cpu-oci
+          tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci
+          skopeo copy --override-os linux --override-arch amd64 \
+            oci:/tmp/cpu-oci docker-daemon:${{ env.DEVBOX_IMAGE }}
+          docker images ${{ env.DEVBOX_IMAGE }}
+      - name: Start devbox cluster
+        run: ./runs/test/devbox/start-devbox.sh
+      - name: Pre-pull worker image into k3s containerd
+        # Avoids the in-cluster pull dominating the test budget when the run is submitted.
+        run: docker exec flyte-devbox crictl pull ${{ env.FLYTE_WORKER_IMAGE }}
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install flyte SDK
+        run: pip install "flyte==${{ env.FLYTE_SDK_VERSION }}"
+      - name: Configure SDK
+        # Talk directly to flyte-binary (port-forwarded by start-devbox.sh).
+        # The Traefik NodePort 30080 doesn't speak h2c reliably for gRPC, so
+        # SDK calls fall over with cryptic ValueErrors when routed through it.
+        run: |
+          mkdir -p "$HOME/.flyte"
+          cat > "$HOME/.flyte/config.yaml" <<'EOF'
+          admin:
+            endpoint: dns:///localhost:8090
+            insecure: true
+          task:
+            domain: development
+            project: flytesnacks
+            org: localhost
+          image:
+            builder: local
+          EOF
+      - name: Probe DataProxy reachability
+        # Diagnostic: hit DataProxy.CreateUploadLocation via plain HTTP/1.1
+        # Connect protocol on both endpoints. This bypasses h2c questions and
+        # tells us (a) whether DataProxy is functional, (b) what host the
+        # signed URL points at, (c) whether the rustfs.flyte:9000 bridge is
+        # actually wired up correctly.
+        run: |
+          set +e
+          set -x
+          payload='{"project":"flytesnacks","domain":"development","org":"localhost","filename":"probe.tar.gz","filename_root":"ci-probe","content_md5":"dGVzdC1jb250ZW50LWhhc2g=","expires_in":"1800s","content_length":1024}'
+          for ep in "http://localhost:8090" "http://localhost:30080"; do
+            echo "::group::CreateUploadLocation @ $ep"
+            curl -sS -i \
+              -H 'Content-Type: application/json' \
+              -X POST "$ep/flyteidl2.dataproxy.DataProxyService/CreateUploadLocation" \
+              --data "$payload"
+            echo
+            echo "::endgroup::"
+          done
+          echo "::group::PUT to signed URL via rustfs.flyte:9000 bridge"
+          signed_url=$(curl -sS \
+            -H 'Content-Type: application/json' \
+            -X POST "http://localhost:8090/flyteidl2.dataproxy.DataProxyService/CreateUploadLocation" \
+            --data "$payload" | python -c 'import sys,json; print(json.load(sys.stdin).get("signedUrl",""))')
+          echo "signed_url=$signed_url"
+          if [ -n "$signed_url" ]; then
+            curl -sS -i -X PUT --data-binary "hello" "$signed_url" || true
+          fi
+          echo "::endgroup::"
+          echo "::group::rustfs.flyte resolution + reachability"
+          getent hosts rustfs.flyte || true
+          curl -sS -i http://rustfs.flyte:9000/ || true
+          echo "::endgroup::"
+      - name: Submit task and wait for SUCCEEDED
+        run: python runs/test/devbox/hello.py
+      - name: Diagnostics on failure
+        if: failure()
+        run: |
+          echo "::group::docker logs"
+          docker logs flyte-devbox 2>&1 | tail -200 || true
+          echo "::endgroup::"
+          echo "::group::pods"
+          kubectl get pods -A || true
+          echo "::endgroup::"
+          echo "::group::flyte-binary logs"
+          kubectl logs -n flyte deploy/flyte-binary --tail=500 || true
+          echo "::endgroup::"
+          echo "::group::describe failing pods"
+          kubectl get pods -A --no-headers | awk '$4 != "Running" && $4 != "Completed"' \
+            | while read ns name _; do kubectl describe pod -n "$ns" "$name" || true; done
+          echo "::endgroup::"
diff --git a/runs/test/devbox/hello.py b/runs/test/devbox/hello.py
new file mode 100644
index 0000000000..b88ad39945
--- /dev/null
+++ b/runs/test/devbox/hello.py
@@ -0,0 +1,52 @@
+"""Submits a tiny task to a running devbox and asserts it reaches SUCCEEDED.
+
+Used by .github/workflows/flyte-binary-v2.yml as the post-build integration
+gate: if this script exits non-zero, the just-built devbox image is broken.
+
+The SDK reads connection info from $HOME/.flyte/config.yaml. Storage for
+fast-registration uploads is handled server-side by the devbox's DataProxy +
+rustfs, so the SDK doesn't need explicit S3 credentials. The worker image
+comes from $FLYTE_WORKER_IMAGE so CI can pre-pull it into k3s before
+submission and keep this script aligned with whatever tag the workflow
+loaded.
+"""
+import os
+import sys
+
+import flyte
+
+WORKER_IMAGE = os.environ["FLYTE_WORKER_IMAGE"]
+
+env = flyte.TaskEnvironment(
+    name="devbox_ci_smoke",
+    image=WORKER_IMAGE,
+)
+
+
+@env.task
+def add_one(x: int) -> int:
+    return x + 1
+
+
+def main() -> int:
+    try:
+        flyte.init_from_config()
+        run = flyte.run(add_one, x=41)
+    except Exception as e:
+        # The SDK wraps storage errors with a generic message; walk the chain
+        # so CI logs show the real cause (network, signing, etc.).
+        cur, depth = e, 0
+        while cur is not None and depth < 10:
+            print(f"  [{depth}] {type(cur).__name__}: {cur}", file=sys.stderr)
+            cur = cur.__cause__ or cur.__context__
+            depth += 1
+        raise
+    print(f"run.result={run.result!r}")
+    if run.result != 42:
+        print(f"FAIL: expected 42, got {run.result!r}", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/runs/test/devbox/start-devbox.sh b/runs/test/devbox/start-devbox.sh
new file mode 100755
index 0000000000..8b5d620a41
--- /dev/null
+++ b/runs/test/devbox/start-devbox.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+# Boots the bundled flyte-devbox image as a single-container k3s cluster
+# suitable for integration tests. Mirrors `make start` in
+# docker/devbox-bundled/Makefile but headless and CI-friendly: writes
+# kubeconfig to $PWD/.kube/kubeconfig, exports KUBECONFIG via $GITHUB_ENV,
+# and waits for the flyte-binary pod to report Ready before returning.
+set -euo pipefail
+
+IMAGE="${DEVBOX_IMAGE:-flyte-devbox:ci}"
+NAME="${DEVBOX_NAME:-flyte-devbox}"
+KUBE_DIR="${KUBE_DIR:-$PWD/.kube}"
+READY_TIMEOUT="${READY_TIMEOUT:-300}"
+
+mkdir -p "$KUBE_DIR"
+rm -f "$KUBE_DIR/kubeconfig"
+
+docker run -d --rm --privileged --name "$NAME" \
+  --add-host host.docker.internal:host-gateway \
+  -e K3S_KUBECONFIG_OUTPUT=/.kube/kubeconfig \
+  -v "$KUBE_DIR":/.kube \
+  -p 6443:6443 \
+  -p 30000:30000 \
+  -p 30001:5432 \
+  -p 30002:30002 \
+  -p 30080:30080 \
+  -p 30081:30081 \
+  "$IMAGE"
+
+echo "Waiting for kubeconfig (timeout ${READY_TIMEOUT}s)..."
+deadline=$(( $(date +%s) + READY_TIMEOUT ))
+until [ -s "$KUBE_DIR/kubeconfig" ]; do
+  if [ "$(date +%s)" -gt "$deadline" ]; then
+    echo "ERROR: kubeconfig not written within ${READY_TIMEOUT}s" >&2
+    docker logs "$NAME" >&2 || true
+    exit 1
+  fi
+  sleep 1
+done
+docker exec "$NAME" chown "$(id -u):$(id -g)" /.kube/kubeconfig
+
+KUBECONFIG="$KUBE_DIR/kubeconfig"
+export KUBECONFIG
+if [ -n "${GITHUB_ENV:-}" ]; then
+  echo "KUBECONFIG=$KUBECONFIG" >> "$GITHUB_ENV"
+fi
+
+echo "Waiting for flyte namespace..."
+until kubectl get ns flyte >/dev/null 2>&1; do
+  if [ "$(date +%s)" -gt "$deadline" ]; then
+    echo "ERROR: flyte namespace not created within ${READY_TIMEOUT}s" >&2
+    kubectl get ns >&2 || true
+    exit 1
+  fi
+  sleep 2
+done
+
+echo "Waiting for flyte-binary deployment to exist..."
+until kubectl get deploy -n flyte flyte-binary >/dev/null 2>&1; do
+  if [ "$(date +%s)" -gt "$deadline" ]; then
+    echo "ERROR: flyte-binary deployment not created within ${READY_TIMEOUT}s" >&2
+    kubectl get all -A >&2 || true
+    exit 1
+  fi
+  sleep 2
+done
+
+remaining=$(( deadline - $(date +%s) ))
+[ "$remaining" -lt 30 ] && remaining=30
+echo "Waiting for flyte-binary rollout (timeout ${remaining}s)..."
+kubectl rollout status deploy/flyte-binary -n flyte --timeout="${remaining}s"
+
+# Bridge rustfs.flyte:9000 -> localhost:30002 (the rustfs NodePort).
+# DataProxy mints signed URLs whose host is the in-cluster storage endpoint
+# (http://rustfs.flyte:9000), which is unreachable from the runner. We add a
+# /etc/hosts entry and a TCP forwarder so the SDK's PUT to the signed URL
+# resolves to the published NodePort and lands on the rustfs pod.
+if ! grep -q '[[:space:]]rustfs\.flyte\b' /etc/hosts; then
+  echo "127.0.0.1 rustfs.flyte" | sudo tee -a /etc/hosts >/dev/null
+fi
+if ! command -v socat >/dev/null 2>&1; then
+  sudo apt-get update -qq && sudo apt-get install -y -qq socat
+fi
+nohup socat TCP-LISTEN:9000,reuseaddr,fork TCP:127.0.0.1:30002 \
+  >/tmp/rustfs-forward.log 2>&1 &
+disown
+forward_deadline=$(( $(date +%s) + 15 ))
+until nc -z 127.0.0.1 9000 2>/dev/null; do
+  if [ "$(date +%s)" -gt "$forward_deadline" ]; then
+    echo "ERROR: rustfs.flyte:9000 forward did not open" >&2
+    cat /tmp/rustfs-forward.log >&2 || true
+    exit 1
+  fi
+  sleep 0.3
+done
+
+# Port-forward directly to the flyte-binary ClusterIP service.
+# The bundled Traefik on NodePort 30080 doesn't reliably do h2c, so the
+# Python SDK's gRPC client (HTTP/2 cleartext) fails through it. Talking
+# directly to svc/flyte-binary-http:8090 sidesteps the proxy entirely.
+nohup kubectl port-forward -n flyte svc/flyte-binary-http 8090:8090 \
+  --address 127.0.0.1 \
+  >/tmp/flyte-binary-pf.log 2>&1 &
+disown
+pf_deadline=$(( $(date +%s) + 15 ))
+until nc -z 127.0.0.1 8090 2>/dev/null; do
+  if [ "$(date +%s)" -gt "$pf_deadline" ]; then
+    echo "ERROR: flyte-binary port-forward did not open" >&2
+    cat /tmp/flyte-binary-pf.log >&2 || true
+    exit 1
+  fi
+  sleep 0.3
+done
+
+echo "Devbox ready."
+echo "  flyte-binary (direct):  http://localhost:8090"
+echo "  flyte-binary (Traefik): http://localhost:30080"
+echo "  rustfs S3:              http://localhost:30002 (also rustfs.flyte:9000)"
+echo "  Postgres:               localhost:30001"