Skip to content

Commit abc7702

Browse files
committed
feat: add AI workload policies for GPU governance
Add 5 new policies for AI/ML workload governance on Kubernetes: - k8sgpuresourcelimits: Enforce max GPU count per container - k8srequiredgputoleration: Require GPU pods to tolerate GPU node taints - k8sgpuactivedeadline: Require GPU pods to set activeDeadlineSeconds - k8sgpusharedmemory: Require GPU containers to mount memory-backed /dev/shm - k8srequiredgpuruntimeclass: Require GPU pods to use an allowed runtimeClassName Each policy includes: - Dual-engine implementation (Rego + CEL/K8sNativeValidation) - OPA unit tests (21 tests total, all passing) - Gator integration tests (suite.yaml with sample constraints and resources) - exemptImages parameter support Also adds an 'ai-workload' bundle to catalog.yaml that groups these policies with the existing k8snounsupportedgpu policy. Signed-off-by: Jaydip Gabani <gabanijaydip@gmail.com>
1 parent 978bd88 commit abc7702

130 files changed

Lines changed: 5392 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
version: 1.0.0
2+
name: k8sgpuactivedeadline
3+
displayName: GPU Active Deadline Required
4+
createdAt: "2026-03-17T00:04:28Z"
5+
description: Requires pods that request NVIDIA GPU resources (nvidia.com/gpu) to set activeDeadlineSeconds. This prevents runaway training jobs from holding GPU resources indefinitely.
6+
digest: 7edfe0c97c446e801bf2e6109ef3adbdac6c603b540cd9f76eef66e46b6082b8
7+
license: Apache-2.0
8+
homeURL: https://open-policy-agent.github.io/gatekeeper-library/website/gpuactivedeadline
9+
keywords:
10+
- gatekeeper
11+
- open-policy-agent
12+
- policies
13+
readme: |-
14+
# GPU Active Deadline Required
15+
Requires pods that request NVIDIA GPU resources (nvidia.com/gpu) to set activeDeadlineSeconds. This prevents runaway training jobs from holding GPU resources indefinitely.
16+
install: |-
17+
### Usage
18+
```shell
19+
kubectl apply -f https://raw.githubusercontent.com/open-policy-agent/gatekeeper-library/master/artifacthub/library/general/gpuactivedeadline/1.0.0/template.yaml
20+
```
21+
provider:
22+
name: Gatekeeper Library
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
resources:
2+
- template.yaml
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
apiVersion: constraints.gatekeeper.sh/v1beta1
2+
kind: K8sGpuActiveDeadline
3+
metadata:
4+
name: require-gpu-deadline
5+
spec:
6+
match:
7+
kinds:
8+
- apiGroups: [""]
9+
kinds: ["Pod"]
10+
parameters:
11+
maxActiveDeadlineSeconds: 86400
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: gpu-job-with-deadline
5+
spec:
6+
activeDeadlineSeconds: 3600
7+
containers:
8+
- name: training
9+
image: nvidia/cuda:12.0-runtime
10+
resources:
11+
limits:
12+
nvidia.com/gpu: "1"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
apiVersion: constraints.gatekeeper.sh/v1beta1
2+
kind: K8sGpuActiveDeadline
3+
metadata:
4+
name: require-gpu-deadline
5+
spec:
6+
match:
7+
kinds:
8+
- apiGroups: [""]
9+
kinds: ["Pod"]
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: gpu-job-without-deadline
5+
spec:
6+
containers:
7+
- name: training
8+
image: nvidia/cuda:12.0-runtime
9+
resources:
10+
limits:
11+
nvidia.com/gpu: "1"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
apiVersion: constraints.gatekeeper.sh/v1beta1
2+
kind: K8sGpuActiveDeadline
3+
metadata:
4+
name: require-gpu-deadline
5+
spec:
6+
match:
7+
kinds:
8+
- apiGroups: [""]
9+
kinds: ["Pod"]
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: non-gpu-job
5+
spec:
6+
containers:
7+
- name: web
8+
image: nginx:1.25
9+
resources:
10+
limits:
11+
cpu: "500m"
12+
memory: "128Mi"
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
kind: Suite
2+
apiVersion: test.gatekeeper.sh/v1alpha1
3+
metadata:
4+
name: gpuactivedeadline
5+
tests:
6+
- name: gpu-job-with-deadline
7+
template: template.yaml
8+
constraint: samples/gpu-job-with-deadline/constraint.yaml
9+
cases:
10+
- name: example-allowed
11+
object: samples/gpu-job-with-deadline/example_allowed.yaml
12+
assertions:
13+
- violations: no
14+
- name: gpu-job-without-deadline
15+
template: template.yaml
16+
constraint: samples/gpu-job-without-deadline/constraint.yaml
17+
cases:
18+
- name: example-disallowed
19+
object: samples/gpu-job-without-deadline/example_disallowed.yaml
20+
assertions:
21+
- violations: yes
22+
- name: non-gpu-job
23+
template: template.yaml
24+
constraint: samples/non-gpu-job/constraint.yaml
25+
cases:
26+
- name: example-allowed
27+
object: samples/non-gpu-job/example_allowed.yaml
28+
assertions:
29+
- violations: no
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
apiVersion: templates.gatekeeper.sh/v1
2+
kind: ConstraintTemplate
3+
metadata:
4+
name: k8sgpuactivedeadline
5+
annotations:
6+
metadata.gatekeeper.sh/title: "GPU Active Deadline Required"
7+
metadata.gatekeeper.sh/version: 1.0.0
8+
description: >-
9+
Requires pods that request NVIDIA GPU resources (nvidia.com/gpu) to set
10+
activeDeadlineSeconds. This prevents runaway training jobs from holding
11+
GPU resources indefinitely.
12+
spec:
13+
crd:
14+
spec:
15+
names:
16+
kind: K8sGpuActiveDeadline
17+
validation:
18+
openAPIV3Schema:
19+
type: object
20+
description: >-
21+
Requires GPU pods to set activeDeadlineSeconds.
22+
properties:
23+
maxActiveDeadlineSeconds:
24+
description: >-
25+
The maximum value allowed for activeDeadlineSeconds. Set to 0 (or omit) to
26+
require only that the field is present, without enforcing a maximum.
27+
type: integer
28+
exemptImages:
29+
description: >-
30+
Any container that uses an image that matches an entry in this list will be excluded
31+
from enforcement. Prefix-matching can be signified with `*`.
32+
type: array
33+
items:
34+
type: string
35+
targets:
36+
- target: admission.k8s.gatekeeper.sh
37+
code:
38+
- engine: K8sNativeValidation
39+
source:
40+
variables:
41+
- name: containers
42+
expression: 'has(variables.anyObject.spec.containers) ? variables.anyObject.spec.containers : []'
43+
- name: initContainers
44+
expression: 'has(variables.anyObject.spec.initContainers) ? variables.anyObject.spec.initContainers : []'
45+
- name: ephemeralContainers
46+
expression: 'has(variables.anyObject.spec.ephemeralContainers) ? variables.anyObject.spec.ephemeralContainers : []'
47+
- name: exemptImagePrefixes
48+
expression: |
49+
!has(variables.params.exemptImages) ? [] :
50+
variables.params.exemptImages.filter(image, image.endsWith("*")).map(image, string(image).replace("*", ""))
51+
- name: exemptImageExplicit
52+
expression: |
53+
!has(variables.params.exemptImages) ? [] :
54+
variables.params.exemptImages.filter(image, !image.endsWith("*"))
55+
- name: exemptImages
56+
expression: |
57+
(variables.containers + variables.initContainers + variables.ephemeralContainers).filter(container,
58+
container.image in variables.exemptImageExplicit ||
59+
variables.exemptImagePrefixes.exists(exemption, string(container.image).startsWith(exemption))
60+
).map(container, container.image)
61+
- name: podRequestsGpu
62+
expression: |
63+
(variables.containers + variables.initContainers + variables.ephemeralContainers).exists(container,
64+
!(container.image in variables.exemptImages) &&
65+
has(container.resources) &&
66+
has(container.resources.limits) &&
67+
"nvidia.com/gpu" in container.resources.limits &&
68+
quantity(string(container.resources.limits["nvidia.com/gpu"])).compareTo(quantity("0")) > 0
69+
)
70+
- name: hasDeadline
71+
expression: 'has(variables.anyObject.spec.activeDeadlineSeconds)'
72+
- name: maxDeadline
73+
expression: 'has(variables.params.maxActiveDeadlineSeconds) ? variables.params.maxActiveDeadlineSeconds : 0'
74+
validations:
75+
# Every pod with a non-exempt GPU container must declare activeDeadlineSeconds.
- expression: '!variables.podRequestsGpu || variables.hasDeadline'
76+
messageExpression: '"Pod <" + variables.anyObject.metadata.name + "> requests GPU resources but does not set activeDeadlineSeconds"'
77+
# The upper bound is enforced only when maxDeadline is positive. A zero, negative, or
# unset value disables it, matching the Rego engine's `max_deadline > 0` guard so both
# engines return the same verdict for the same constraint.
- expression: '!variables.podRequestsGpu || !variables.hasDeadline || variables.maxDeadline <= 0 || variables.anyObject.spec.activeDeadlineSeconds <= variables.maxDeadline'
78+
messageExpression: '"Pod <" + variables.anyObject.metadata.name + "> sets activeDeadlineSeconds to " + string(variables.anyObject.spec.activeDeadlineSeconds) + ", which exceeds the maximum allowed " + string(variables.maxDeadline)'
- engine: Rego
80+
source:
81+
rego: |
82+
package k8sgpuactivedeadline
83+
84+
# Reports a violation when the pod requests GPUs through at least one
# non-exempt container but does not set spec.activeDeadlineSeconds.
violation[{"msg": msg}] {
85+
pod_requests_gpu
86+
not has_active_deadline
87+
msg := sprintf("Pod <%v> requests GPU resources but does not set activeDeadlineSeconds", [input.review.object.metadata.name])
88+
}
89+
90+
# Reports a violation when a GPU pod sets activeDeadlineSeconds to a value
# larger than the maxActiveDeadlineSeconds parameter.
violation[{"msg": msg}] {
91+
pod_requests_gpu
92+
has_active_deadline
93+
# Falls back to 0 when the parameter is not supplied.
max_deadline := object.get(input, ["parameters", "maxActiveDeadlineSeconds"], 0)
94+
# Only a positive maximum is enforced; 0 (or less) disables this rule.
max_deadline > 0
95+
deadline := input.review.object.spec.activeDeadlineSeconds
96+
deadline > max_deadline
97+
msg := sprintf("Pod <%v> sets activeDeadlineSeconds to %v, which exceeds the maximum allowed %v", [input.review.object.metadata.name, deadline, max_deadline])
98+
}
99+
100+
# Holds when any non-exempt container (regular, init, or ephemeral) has a
# resources.limits["nvidia.com/gpu"] value greater than zero.
# Only limits are inspected; requests are not consulted.
pod_requests_gpu {
101+
container := input_containers[_]
102+
not is_exempt(container)
103+
gpu := container.resources.limits["nvidia.com/gpu"]
104+
# The limit value is converted with to_number before comparing.
to_number(gpu) > 0
105+
}
106+
107+
# Holds when spec.activeDeadlineSeconds is present on the reviewed pod.
has_active_deadline {
108+
input.review.object.spec.activeDeadlineSeconds
109+
}
110+
111+
# Set of every container in the reviewed pod, built from three partial rules:
# spec.containers, spec.initContainers, and spec.ephemeralContainers.
input_containers[c] {
112+
c := input.review.object.spec.containers[_]
113+
}
114+
115+
input_containers[c] {
116+
c := input.review.object.spec.initContainers[_]
117+
}
118+
119+
input_containers[c] {
120+
c := input.review.object.spec.ephemeralContainers[_]
121+
}
122+
123+
# Holds when the container's image matches at least one entry of the
# exemptImages parameter.
is_exempt(container) {
124+
# An absent exemptImages parameter is treated as an empty list.
exempt_images := object.get(input, ["parameters", "exemptImages"], [])
125+
img := container.image
126+
exemption := exempt_images[_]
127+
_matches_exemption(img, exemption)
128+
}
129+
130+
# Exact match: an exemption that does not end in "*" must equal the image.
_matches_exemption(img, exemption) {
131+
not endswith(exemption, "*")
132+
exemption == img
133+
}
134+
135+
# Prefix match: an exemption ending in "*" matches any image that starts with
# the exemption text after its trailing "*" is stripped.
_matches_exemption(img, exemption) {
136+
endswith(exemption, "*")
137+
prefix := trim_suffix(exemption, "*")
138+
startswith(img, prefix)
139+
}

0 commit comments

Comments
 (0)