Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions examples/external-podgroup/multi-topology-jobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright 2026 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

---
# Topology keyed on the rack label; the same label is set on the workers in
# hack/e2e-kind-config-multi-topology.yaml.
apiVersion: kai.scheduler/v1alpha1
kind: Topology
metadata:
  name: rack-topology
spec:
  # Each level names the node label that identifies a domain at that level.
  levels:
    - nodeLabel: topology.test/rack-domain
    - nodeLabel: kubernetes.io/hostname

---
# Topology keyed on the zone label; the same label is set on the workers in
# hack/e2e-kind-config-multi-topology.yaml.
apiVersion: kai.scheduler/v1alpha1
kind: Topology
metadata:
  name: zone-topology
spec:
  # Each level names the node label that identifies a domain at that level.
  levels:
    - nodeLabel: topology.test/zone-domain
    - nodeLabel: kubernetes.io/hostname

---
# A single PodGroup whose two subgroups each reference a different Topology
# resource. The Jobs in this file attach their pods to it through the
# pod-group-name annotation and the kai.scheduler/subgroup-name label.
apiVersion: scheduling.run.ai/v2alpha2
kind: PodGroup
metadata:
  name: multi-topology-pipeline
  namespace: default
  labels:
    kai.scheduler/queue: default-queue
spec:
  queue: default-queue
  minSubGroup: 2
  subGroups:
    # Pods of rack-bound-job: constrained at the rack level of rack-topology.
    - name: rack-workers
      minMember: 3
      topologyConstraint:
        topology: rack-topology
        requiredTopologyLevel: topology.test/rack-domain
    # Pods of zone-bound-job: constrained at the zone level of zone-topology.
    - name: zone-workers
      minMember: 3
      topologyConstraint:
        topology: zone-topology
        requiredTopologyLevel: topology.test/zone-domain

---
# Job whose three pods join the rack-workers subgroup of the
# multi-topology-pipeline PodGroup.
apiVersion: batch/v1
kind: Job
metadata:
  name: rack-bound-job
  namespace: default
  annotations:
    # The PodGroup is declared explicitly in this file, so the Job is marked
    # to be skipped by the pod-grouper.
    kai.scheduler/skip-podgrouper: "true"
spec:
  parallelism: 3
  completions: 3
  template:
    metadata:
      annotations:
        pod-group-name: multi-topology-pipeline
      labels:
        kai.scheduler/queue: default-queue
        kai.scheduler/subgroup-name: rack-workers
    spec:
      schedulerName: kai-scheduler
      restartPolicy: Never
      containers:
        - name: worker
          image: busybox:1.36
          command:
            - "sh"
            - "-c"
            - "sleep 3600"
          resources:
            requests:
              cpu: "250m"
              nvidia.com/gpu: "4"
            limits:
              cpu: "250m"
              nvidia.com/gpu: "4"

---
# Job whose three pods join the zone-workers subgroup of the
# multi-topology-pipeline PodGroup.
apiVersion: batch/v1
kind: Job
metadata:
  name: zone-bound-job
  namespace: default
  annotations:
    # The PodGroup is declared explicitly in this file, so the Job is marked
    # to be skipped by the pod-grouper.
    kai.scheduler/skip-podgrouper: "true"
spec:
  parallelism: 3
  completions: 3
  template:
    metadata:
      annotations:
        pod-group-name: multi-topology-pipeline
      labels:
        kai.scheduler/queue: default-queue
        kai.scheduler/subgroup-name: zone-workers
    spec:
      schedulerName: kai-scheduler
      restartPolicy: Never
      containers:
        - name: worker
          image: busybox:1.36
          command:
            - "sh"
            - "-c"
            - "sleep 3600"
          resources:
            requests:
              cpu: "250m"
              nvidia.com/gpu: "4"
            limits:
              cpu: "250m"
              nvidia.com/gpu: "4"
49 changes: 49 additions & 0 deletions hack/e2e-kind-config-multi-topology.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2026 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

# kind cluster with one control-plane node and six workers. The workers cover
# every combination of three racks (rack-a, rack-b, rack-c) and two zones
# (zone-x, zone-y): two workers per rack, three workers per zone.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
  # rack-a / zone-x
  - role: worker
    labels:
      run.ai/simulated-gpu-node-pool: default
      nvidia.com/gpu.deploy.device-plugin: "true"
      nvidia.com/gpu.memory: "11441"
      topology.test/rack-domain: rack-a
      topology.test/zone-domain: zone-x
  # rack-a / zone-y
  - role: worker
    labels:
      run.ai/simulated-gpu-node-pool: default
      nvidia.com/gpu.deploy.device-plugin: "true"
      nvidia.com/gpu.memory: "11441"
      topology.test/rack-domain: rack-a
      topology.test/zone-domain: zone-y
  # rack-b / zone-x
  - role: worker
    labels:
      run.ai/simulated-gpu-node-pool: default
      nvidia.com/gpu.deploy.device-plugin: "true"
      nvidia.com/gpu.memory: "11441"
      topology.test/rack-domain: rack-b
      topology.test/zone-domain: zone-x
  # rack-b / zone-y
  - role: worker
    labels:
      run.ai/simulated-gpu-node-pool: default
      nvidia.com/gpu.deploy.device-plugin: "true"
      nvidia.com/gpu.memory: "11441"
      topology.test/rack-domain: rack-b
      topology.test/zone-domain: zone-y
  # rack-c / zone-x
  - role: worker
    labels:
      run.ai/simulated-gpu-node-pool: default
      nvidia.com/gpu.deploy.device-plugin: "true"
      nvidia.com/gpu.memory: "11441"
      topology.test/rack-domain: rack-c
      topology.test/zone-domain: zone-x
  # rack-c / zone-y
  - role: worker
    labels:
      run.ai/simulated-gpu-node-pool: default
      nvidia.com/gpu.deploy.device-plugin: "true"
      nvidia.com/gpu.memory: "11441"
      topology.test/rack-domain: rack-c
      topology.test/zone-domain: zone-y
47 changes: 36 additions & 11 deletions hack/setup-e2e-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,24 @@ CLUSTER_NAME=${CLUSTER_NAME:-e2e-kai-scheduler}

# Repository root: the parent of the directory containing this script.
REPO_ROOT=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..
# Feature configuration handed to generate-kind-config.sh. The expansion is
# quoted so the resulting value is never word-split or glob-expanded.
: "${FEATURE_CONFIG:=default}"
# Optional path to an existing kind config; empty means "generate one".
KIND_CONFIG=${KIND_CONFIG:-""}
# Set once a temporary kind config has been generated, so cleanup can remove it.
GENERATED_KIND_CONFIG=""
# Set once the registry port-forward is started, so cleanup can stop it.
PORT_FORWARD_PID=""

# Kubernetes version tag and node image for the kind cluster; both can be
# overridden from the environment.
: "${KIND_K8S_TAG:=v1.35.0}"
: "${KIND_IMAGE:=kindest/node:${KIND_K8S_TAG}}"

# Release whatever the script has acquired so far: stop the background
# port-forward (if one was started) and delete the generated kind config
# (if one was created). Both steps are skipped when their variable is empty.
cleanup() {
  local forward_pid="$PORT_FORWARD_PID"
  local temp_config="$GENERATED_KIND_CONFIG"

  if [[ -n "$forward_pid" ]]; then
    # The process may already be gone; that is not an error.
    kill "$forward_pid" 2>/dev/null || true
  fi

  if [[ -n "$temp_config" ]]; then
    rm -f "$temp_config"
  fi
}

trap cleanup EXIT

# Parse named parameters
TEST_THIRD_PARTY_INTEGRATIONS=${TEST_THIRD_PARTY_INTEGRATIONS:-"false"}
LOCAL_IMAGES_BUILD=${LOCAL_IMAGES_BUILD:-"false"}
Expand All @@ -38,12 +52,17 @@ while [[ $# -gt 0 ]]; do
FEATURE_CONFIG="$2"
shift 2
;;
--kind-config)
KIND_CONFIG="$2"
shift 2
;;
-h|--help)
echo "Usage: $0 [--test-third-party-integrations] [--local-images-build] [--install-vpa] [--feature-config <config>]"
echo "Usage: $0 [--test-third-party-integrations] [--local-images-build] [--install-vpa] [--feature-config <config>] [--kind-config <path>]"
echo " --test-third-party-integrations: Install third party operators for compatibility testing"
echo " --local-images-build: Build and use local images instead of pulling from registry"
echo " --install-vpa: Install Vertical Pod Autoscaler and metrics-server"
echo " --feature-config: Feature configuration for kind cluster generation (default: \"default\")"
echo " --kind-config: Existing kind config file to use instead of generating one"
exit 0
;;
*)
Expand All @@ -54,20 +73,27 @@ while [[ $# -gt 0 ]]; do
esac
done

GENERATED_KIND_CONFIG=$(mktemp "${TMPDIR:-/tmp}/kind-config-XXXXXX.yaml")
trap "rm -f \"$GENERATED_KIND_CONFIG\"" EXIT
${REPO_ROOT}/hack/generate-kind-config.sh \
--feature-config "$FEATURE_CONFIG" \
--k8s-version "$KIND_K8S_TAG" \
--output "$GENERATED_KIND_CONFIG"
if [[ -n "$KIND_CONFIG" && "$FEATURE_CONFIG" != "default" ]]; then
echo "--feature-config cannot be used together with --kind-config"
Comment thread
davidLif marked this conversation as resolved.
exit 1
fi

# Pick the config passed to `kind create cluster`: a caller-supplied file is
# used as-is; otherwise a temporary config is generated for FEATURE_CONFIG.
if [[ -n "$KIND_CONFIG" ]]; then
  CLUSTER_KIND_CONFIG="$KIND_CONFIG"
else
  # The path is recorded in GENERATED_KIND_CONFIG so cleanup can delete it.
  # NOTE(review): some mktemp implementations only randomize trailing X's;
  # confirm the ".yaml" suffix behaves as intended on every supported platform.
  GENERATED_KIND_CONFIG=$(mktemp "${TMPDIR:-/tmp}/kind-config-XXXXXX.yaml")
  # Quote the script path so a checkout directory containing spaces still works.
  "${REPO_ROOT}/hack/generate-kind-config.sh" \
    --feature-config "$FEATURE_CONFIG" \
    --k8s-version "$KIND_K8S_TAG" \
    --output "$GENERATED_KIND_CONFIG"
  CLUSTER_KIND_CONFIG="$GENERATED_KIND_CONFIG"
fi

kind create cluster \
--config "$GENERATED_KIND_CONFIG" \
--config "$CLUSTER_KIND_CONFIG" \
--image "${KIND_IMAGE}" \
--name "$CLUSTER_NAME"

rm -f "$GENERATED_KIND_CONFIG"
Comment thread
davidLif marked this conversation as resolved.

# Deploy local image registry
echo "Deploying local image registry..."
kubectl apply -f ${REPO_ROOT}/hack/local_registry.yaml
Expand Down Expand Up @@ -140,7 +166,6 @@ if [ "$LOCAL_IMAGES_BUILD" = "true" ]; then
# Start port-forward to local registry
kubectl port-forward -n kube-registry deploy/registry 30100:5000 &
PORT_FORWARD_PID=$!
trap "kill $PORT_FORWARD_PID 2>/dev/null || true" EXIT
sleep 2

# Probe whether docker push can reach the registry (fails on Docker Desktop where the
Expand Down
Loading
Loading