Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions .github/workflows/flyte-binary-v2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ jobs:
run: |
mkdir -p /tmp/cpu-oci
tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci
- name: Upload CPU OCI archive for integration test
uses: actions/upload-artifact@v4
with:
name: devbox-cpu-oci
path: /tmp/cpu-oci.tar
retention-days: 1
- name: Push CPU multi-arch image
if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
uses: docker/build-push-action@v6
Expand Down Expand Up @@ -225,3 +231,105 @@ jobs:
push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
cache-from: type=gha,scope=demo-gpu
cache-to: type=gha,mode=max,scope=demo-gpu

# Post-build integration gate: loads the devbox-cpu-oci artifact into Docker,
# boots the devbox cluster, and submits a smoke-test task through the flyte SDK.
devbox-integration-test:
  runs-on: ubuntu-latest
  needs: [build-and-push-devbox-bundled-image]
  timeout-minutes: 25
  # Job-level env is exported to every step, so the run scripts below read
  # these as quoted shell variables instead of splicing expressions into the
  # script text.
  env:
    DEVBOX_IMAGE: flyte-devbox:ci
    FLYTE_WORKER_IMAGE: ghcr.io/flyteorg/flyte:py3.11-v2.0.0b55
    FLYTE_SDK_VERSION: "2.0.0b55"
  steps:
    - uses: actions/checkout@v4
    - uses: actions/download-artifact@v4
      with:
        name: devbox-cpu-oci
        path: /tmp
    - name: Load devbox image into Docker
      run: |
        sudo apt-get update
        sudo apt-get install -y skopeo
        mkdir -p /tmp/cpu-oci
        tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci
        skopeo copy --override-os linux --override-arch amd64 \
          oci:/tmp/cpu-oci "docker-daemon:${DEVBOX_IMAGE}"
        docker images "$DEVBOX_IMAGE"
    - name: Start devbox cluster
      run: ./runs/test/devbox/start-devbox.sh
    - name: Pre-pull worker image into k3s containerd
      # Avoids the in-cluster pull dominating the test budget when the run is submitted.
      run: docker exec flyte-devbox crictl pull "$FLYTE_WORKER_IMAGE"
    - uses: actions/setup-python@v5
      with:
        python-version: "3.11"
    - name: Install flyte SDK
      run: pip install "flyte==${FLYTE_SDK_VERSION}"
    - name: Configure SDK
      # Talk directly to flyte-binary (port-forwarded by start-devbox.sh).
      # The Traefik NodePort 30080 doesn't speak h2c reliably for gRPC, so
      # SDK calls fall over with cryptic ValueErrors when routed through it.
      run: |
        mkdir -p "$HOME/.flyte"
        cat > "$HOME/.flyte/config.yaml" <<'EOF'
        admin:
          endpoint: dns:///localhost:8090
          insecure: True
        task:
          domain: development
          project: flytesnacks
          org: localhost
        image:
          builder: local
        EOF
    - name: Probe DataProxy reachability
      # Diagnostic: hit DataProxy.CreateUploadLocation via plain HTTP/1.1
      # Connect protocol on both endpoints. This bypasses h2c questions and
      # tells us (a) whether DataProxy is functional, (b) what host the
      # signed URL points at, (c) whether the rustfs.flyte:9000 bridge is
      # actually wired up correctly.
      # Every curl is capped with --max-time so an endpoint that accepts the
      # connection but never answers cannot eat the job's time budget.
      run: |
        set +e
        set -x
        payload='{"project":"flytesnacks","domain":"development","org":"localhost","filename":"probe.tar.gz","filename_root":"ci-probe","content_md5":"dGVzdC1jb250ZW50LWhhc2g=","expires_in":"1800s","content_length":1024}'
        for ep in "http://localhost:8090" "http://localhost:30080"; do
          echo "::group::CreateUploadLocation @ $ep"
          curl -sS -i --max-time 30 \
            -H 'Content-Type: application/json' \
            -X POST "$ep/flyteidl2.dataproxy.DataProxyService/CreateUploadLocation" \
            --data "$payload"
          echo
          echo "::endgroup::"
        done
        echo "::group::PUT to signed URL via rustfs.flyte:9000 bridge"
        signed_url=$(curl -sS --max-time 30 \
          -H 'Content-Type: application/json' \
          -X POST "http://localhost:8090/flyteidl2.dataproxy.DataProxyService/CreateUploadLocation" \
          --data "$payload" | python -c 'import sys,json; print(json.load(sys.stdin).get("signedUrl",""))')
        echo "signed_url=$signed_url"
        if [ -n "$signed_url" ]; then
          curl -sS -i --max-time 30 -X PUT --data-binary "hello" "$signed_url" || true
        fi
        echo "::endgroup::"
        echo "::group::rustfs.flyte resolution + reachability"
        getent hosts rustfs.flyte || true
        curl -sS -i --max-time 30 http://rustfs.flyte:9000/ || true
        echo "::endgroup::"
    - name: Submit task and wait for SUCCEEDED
      run: python runs/test/devbox/hello.py
    - name: Diagnostics on failure
      if: failure()
      # Best-effort dump: every command is followed by `|| true` so one failing
      # collector does not hide the others.
      run: |
        echo "::group::docker logs"
        docker logs flyte-devbox 2>&1 | tail -200 || true
        echo "::endgroup::"
        echo "::group::pods"
        kubectl get pods -A || true
        echo "::endgroup::"
        echo "::group::flyte-binary logs"
        kubectl logs -n flyte deploy/flyte-binary --tail=500 || true
        echo "::endgroup::"
        echo "::group::describe failing pods"
        kubectl get pods -A --no-headers | awk '$4 != "Running" && $4 != "Completed"' \
          | while read -r ns name _; do kubectl describe pod -n "$ns" "$name" || true; done
        echo "::endgroup::"
52 changes: 52 additions & 0 deletions runs/test/devbox/hello.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Submits a tiny task to a running devbox and asserts it reaches SUCCEEDED.

Used by .github/workflows/flyte-binary-v2.yml as the post-build integration
gate: if this script exits non-zero, the just-built devbox image is broken.

The SDK reads connection info from $HOME/.flyte/config.yaml. Storage for
fast-registration uploads is handled server-side by the devbox's DataProxy +
rustfs, so the SDK doesn't need explicit S3 credentials. The worker image
comes from $FLYTE_WORKER_IMAGE so CI can pre-pull it into k3s before
submission and keep this script aligned with whatever tag the workflow
loaded.
"""
import os
import sys

import flyte

# Container image the task runs in. Indexing os.environ directly means a
# missing FLYTE_WORKER_IMAGE raises KeyError as soon as this module is loaded.
WORKER_IMAGE = os.environ["FLYTE_WORKER_IMAGE"]

# Task environment whose `task` decorator registers the smoke-test task.
env = flyte.TaskEnvironment(name="devbox_ci_smoke", image=WORKER_IMAGE)


@env.task
def add_one(x: int) -> int:
    """Return ``x`` incremented by one."""
    incremented = x + 1
    return incremented


def _print_exception_chain(exc: BaseException) -> None:
    """Print up to ten links of ``exc``'s cause/context chain to stderr."""
    cur = exc
    for depth in range(10):
        if cur is None:
            break
        print(f" [{depth}] {type(cur).__name__}: {cur}", file=sys.stderr)
        # Prefer the explicit cause (`raise ... from`), else the implicit context.
        cur = cur.__cause__ or cur.__context__


def main() -> int:
    """Submit ``add_one(x=41)`` and check that the run's result is 42.

    Returns:
        0 when ``run.result`` equals 42, otherwise 1.

    Raises:
        Exception: anything raised by ``flyte.init_from_config`` or
            ``flyte.run`` is re-raised after its chain is printed to stderr.
    """
    try:
        flyte.init_from_config()
        run = flyte.run(add_one, x=41)
    except Exception as e:
        # The SDK wraps storage errors with a generic message; dump the whole
        # chain so CI logs show the underlying cause (network, signing, etc.).
        _print_exception_chain(e)
        raise
    # NOTE(review): assumes the object returned by flyte.run exposes `.result`
    # and that it holds the final task output at this point; confirm against
    # the SDK version pinned in the workflow.
    print(f"run.result={run.result!r}")
    if run.result != 42:
        print(f"FAIL: expected 42, got {run.result!r}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
120 changes: 120 additions & 0 deletions runs/test/devbox/start-devbox.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env bash
# Start the bundled flyte-devbox image as a one-container k3s cluster for
# integration tests. Headless, CI-oriented counterpart of `make start` in
# docker/devbox-bundled/Makefile: the kubeconfig lands in $KUBE_DIR/kubeconfig
# ($PWD/.kube by default), KUBECONFIG is published through $GITHUB_ENV, and the
# script waits for the flyte-binary pod to report Ready before returning.
set -euo pipefail

# All settings can be overridden from the environment.
IMAGE="${DEVBOX_IMAGE:-flyte-devbox:ci}"
NAME="${DEVBOX_NAME:-flyte-devbox}"
KUBE_DIR="${KUBE_DIR:-$PWD/.kube}"
READY_TIMEOUT="${READY_TIMEOUT:-300}"

# Remove any previous kubeconfig so the wait below only succeeds on a file
# written by the container started here.
mkdir -p "$KUBE_DIR"
rm -f "$KUBE_DIR/kubeconfig"

docker_args=(
  -d --rm --privileged
  --name "$NAME"
  --add-host host.docker.internal:host-gateway
  -e K3S_KUBECONFIG_OUTPUT=/.kube/kubeconfig
  -v "$KUBE_DIR":/.kube
  -p 6443:6443
  -p 30000:30000
  -p 30001:5432
  -p 30002:30002
  -p 30080:30080
  -p 30081:30081
)
docker run "${docker_args[@]}" "$IMAGE"

echo "Waiting for kubeconfig (timeout ${READY_TIMEOUT}s)..."
# `deadline` is an absolute epoch second shared by the later readiness waits.
deadline=$(( $(date +%s) + READY_TIMEOUT ))
while [ ! -s "$KUBE_DIR/kubeconfig" ]; do
  if (( $(date +%s) > deadline )); then
    echo "ERROR: kubeconfig not written within ${READY_TIMEOUT}s" >&2
    docker logs "$NAME" >&2 || true
    exit 1
  fi
  sleep 1
done
# Hand the file to the invoking user (presumably it is created by the
# container's own user; verify) so kubectl on the host can read it.
docker exec "$NAME" chown "$(id -u):$(id -g)" /.kube/kubeconfig

export KUBECONFIG="$KUBE_DIR/kubeconfig"
# Under GitHub Actions, persist KUBECONFIG for the steps that follow.
if [[ -n "${GITHUB_ENV:-}" ]]; then
  echo "KUBECONFIG=$KUBECONFIG" >> "$GITHUB_ENV"
fi

# Succeeds once the current time is past the shared `deadline` epoch second.
deadline_passed() {
  (( $(date +%s) > deadline ))
}

echo "Waiting for flyte namespace..."
while ! kubectl get ns flyte >/dev/null 2>&1; do
  if deadline_passed; then
    echo "ERROR: flyte namespace not created within ${READY_TIMEOUT}s" >&2
    kubectl get ns >&2 || true
    exit 1
  fi
  sleep 2
done

echo "Waiting for flyte-binary deployment to exist..."
while ! kubectl get deploy -n flyte flyte-binary >/dev/null 2>&1; do
  if deadline_passed; then
    echo "ERROR: flyte-binary deployment not created within ${READY_TIMEOUT}s" >&2
    kubectl get all -A >&2 || true
    exit 1
  fi
  sleep 2
done

# Give the rollout whatever is left of the budget, but never less than 30s.
remaining=$(( deadline - $(date +%s) ))
if (( remaining < 30 )); then
  remaining=30
fi
echo "Waiting for flyte-binary rollout (timeout ${remaining}s)..."
kubectl rollout status deploy/flyte-binary -n flyte --timeout="${remaining}s"

# Make rustfs.flyte:9000 reachable from the runner by mapping it onto
# localhost:30002 (the rustfs NodePort).
# DataProxy hands out signed URLs addressed to the in-cluster storage endpoint
# (http://rustfs.flyte:9000), which the runner cannot reach on its own. An
# /etc/hosts entry plus a local TCP forwarder lets the SDK's PUT to the signed
# URL resolve to the published NodePort and reach the rustfs pod.
grep -q '[[:space:]]rustfs\.flyte\b' /etc/hosts \
  || echo "127.0.0.1 rustfs.flyte" | sudo tee -a /etc/hosts >/dev/null

# Install socat only when the runner does not already provide it.
if ! command -v socat >/dev/null 2>&1; then
  sudo apt-get update -qq && sudo apt-get install -y -qq socat
fi

# Run the forwarder detached from this script; its output goes to a log file
# that is printed if the port never opens.
nohup socat TCP-LISTEN:9000,reuseaddr,fork TCP:127.0.0.1:30002 \
  >/tmp/rustfs-forward.log 2>&1 &
disown

bridge_deadline=$(( $(date +%s) + 15 ))
while ! nc -z 127.0.0.1 9000 2>/dev/null; do
  if (( $(date +%s) > bridge_deadline )); then
    echo "ERROR: rustfs.flyte:9000 forward did not open" >&2
    cat /tmp/rustfs-forward.log >&2 || true
    exit 1
  fi
  sleep 0.3
done

# Expose flyte-binary on 127.0.0.1:8090 by port-forwarding to the
# svc/flyte-binary-http service.
# The bundled Traefik on NodePort 30080 doesn't reliably do h2c, so the
# Python SDK's gRPC client (HTTP/2 cleartext) fails through it; forwarding
# straight to the service on port 8090 takes the proxy out of the path.
nohup kubectl port-forward -n flyte svc/flyte-binary-http 8090:8090 \
  --address 127.0.0.1 \
  >/tmp/flyte-binary-pf.log 2>&1 &
disown

# Poll for up to 15 seconds until the forwarded port accepts connections.
port_forward_deadline=$(( $(date +%s) + 15 ))
while ! nc -z 127.0.0.1 8090 2>/dev/null; do
  if (( $(date +%s) > port_forward_deadline )); then
    echo "ERROR: flyte-binary port-forward did not open" >&2
    cat /tmp/flyte-binary-pf.log >&2 || true
    exit 1
  fi
  sleep 0.3
done

# Summary of the host-side endpoints set up by this script. Each endpoint is
# listed once; the rustfs entry carries both the NodePort and the bridged
# rustfs.flyte:9000 address.
echo "Devbox ready."
echo " flyte-binary (direct): http://localhost:8090"
echo " flyte-binary (Traefik): http://localhost:30080"
echo " rustfs S3: http://localhost:30002 (also rustfs.flyte:9000)"
echo " Connect API: http://localhost:30080"
echo " Postgres: localhost:30001"
Loading