diff --git a/.gitignore b/.gitignore index c16aa5fd..bbc721c9 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,8 @@ __pycache__/ *.tgz .claude/ +# Extracted subchart artifacts from helm dep update +/kube-prometheus-stack/ +/kube-state-metrics/ +/metrics-server/ + diff --git a/charts/dataplane/templates/_helpers.tpl b/charts/dataplane/templates/_helpers.tpl index b1f9d611..f0c777e9 100644 --- a/charts/dataplane/templates/_helpers.tpl +++ b/charts/dataplane/templates/_helpers.tpl @@ -910,6 +910,201 @@ nodeName: {{- toYaml . }} {{- end }} {{- end -}} +{{/* +Prometheus scheduling helpers +*/}} +{{- define "prometheus.scheduling.topologySpreadConstraints" -}} +{{- with .Values.prometheus.topologySpreadConstraints }} +topologySpreadConstraints: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.affinity" -}} +{{- with .Values.prometheus.affinity }} +affinity: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.nodeSelector" -}} +{{- with .Values.prometheus.nodeSelector }} +nodeSelector: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.nodeName" -}} +{{- with .Values.prometheus.nodeName }} +nodeName: {{ toYaml . }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.tolerations" -}} +{{- with .Values.prometheus.tolerations }} +tolerations: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling" -}} +{{- if .Values.prometheus.topologySpreadConstraints }} +{{- include "prometheus.scheduling.topologySpreadConstraints" . }} +{{- else }} +{{- include "global.scheduling.topologySpreadConstraints" . }} +{{- end }} +{{- if .Values.prometheus.affinity }} +{{- include "prometheus.scheduling.affinity" . }} +{{- else }} +{{- include "global.scheduling.affinity" . }} +{{- end }} +{{- if .Values.prometheus.nodeSelector }} +{{- include "prometheus.scheduling.nodeSelector" . }} +{{- else }} +{{- include "global.scheduling.nodeSelector" . }} +{{- end }} +{{- if .Values.prometheus.nodeName }} +{{- include "prometheus.scheduling.nodeName" . }} +{{- else }} +{{- include "global.scheduling.nodeName" . }} +{{- end }} +{{- if .Values.prometheus.tolerations }} +{{- include "prometheus.scheduling.tolerations" . }} +{{- else }} +{{- include "global.scheduling.tolerations" . }} +{{- end }} +{{- end -}} + +{{/* +Flyteconnector scheduling helpers +*/}} +{{- define "flyteconnector.scheduling.topologySpreadConstraints" -}} +{{- with .Values.flyteconnector.topologySpreadConstraints }} +topologySpreadConstraints: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.affinity" -}} +{{- with .Values.flyteconnector.affinity }} +affinity: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.nodeSelector" -}} +{{- with .Values.flyteconnector.nodeSelector }} +nodeSelector: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.nodeName" -}} +{{- with .Values.flyteconnector.nodeName }} +nodeName: {{ toYaml . }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.tolerations" -}} +{{- with .Values.flyteconnector.tolerations }} +tolerations: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling" -}} +{{- if .Values.flyteconnector.topologySpreadConstraints }} +{{- include "flyteconnector.scheduling.topologySpreadConstraints" . }} +{{- else }} +{{- include "global.scheduling.topologySpreadConstraints" . 
}} +{{- end }} +{{- if .Values.flyteconnector.affinity }} +{{- include "flyteconnector.scheduling.affinity" . }} +{{- else }} +{{- include "global.scheduling.affinity" . }} +{{- end }} +{{- if .Values.flyteconnector.nodeSelector }} +{{- include "flyteconnector.scheduling.nodeSelector" . }} +{{- else }} +{{- include "global.scheduling.nodeSelector" . }} +{{- end }} +{{- if .Values.flyteconnector.nodeName }} +{{- include "flyteconnector.scheduling.nodeName" . }} +{{- else }} +{{- include "global.scheduling.nodeName" . }} +{{- end }} +{{- if .Values.flyteconnector.tolerations }} +{{- include "flyteconnector.scheduling.tolerations" . }} +{{- else }} +{{- include "global.scheduling.tolerations" . }} +{{- end }} +{{- end -}} + +{{/* +Imagebuilder buildkit scheduling helpers +*/}} +{{- define "imagebuilder.buildkit.scheduling.topologySpreadConstraints" -}} +{{- with .Values.imageBuilder.buildkit.topologySpreadConstraints }} +topologySpreadConstraints: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.affinity" -}} +{{- with .Values.imageBuilder.buildkit.affinity }} +affinity: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.nodeSelector" -}} +{{- with .Values.imageBuilder.buildkit.nodeSelector }} +nodeSelector: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.nodeName" -}} +{{- with .Values.imageBuilder.buildkit.nodeName }} +nodeName: {{ toYaml . }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.tolerations" -}} +{{- with .Values.imageBuilder.buildkit.tolerations }} +tolerations: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling" -}} +{{- if .Values.imageBuilder.buildkit.topologySpreadConstraints }} +{{- include "imagebuilder.buildkit.scheduling.topologySpreadConstraints" . }} +{{- else }} +{{- include "global.scheduling.topologySpreadConstraints" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.affinity }} +{{- include "imagebuilder.buildkit.scheduling.affinity" . }} +{{- else }} +{{- include "global.scheduling.affinity" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.nodeSelector }} +{{- include "imagebuilder.buildkit.scheduling.nodeSelector" . }} +{{- else }} +{{- include "global.scheduling.nodeSelector" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.nodeName }} +{{- include "imagebuilder.buildkit.scheduling.nodeName" . }} +{{- else }} +{{- include "global.scheduling.nodeName" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.tolerations }} +{{- include "imagebuilder.buildkit.scheduling.tolerations" . }} +{{- else }} +{{- include "global.scheduling.tolerations" . }} +{{- end }} +{{- end -}} + {{/* Global service account annotations */}} diff --git a/charts/dataplane/templates/flyteconnector/deployment.yaml b/charts/dataplane/templates/flyteconnector/deployment.yaml index a6e60737..0dc9a3cd 100644 --- a/charts/dataplane/templates/flyteconnector/deployment.yaml +++ b/charts/dataplane/templates/flyteconnector/deployment.yaml @@ -62,13 +62,5 @@ spec: {{- with .Values.flyteconnector.additionalVolumes -}} {{ tpl (toYaml .) $ | nindent 6 }} {{- end }} - {{- with .Values.flyteconnector.nodeSelector }} - nodeSelector: {{ tpl (toYaml .) $ | nindent 8 }} - {{- end }} - {{- with .Values.flyteconnector.affinity }} - affinity: {{ tpl (toYaml .) $ | nindent 8 }} - {{- end }} - {{- with .Values.flyteconnector.tolerations }} - tolerations: {{ tpl (toYaml .) 
$ | nindent 8 }} - {{- end }} + {{- include "flyteconnector.scheduling" . | nindent 6 }} {{- end }} diff --git a/charts/dataplane/templates/imagebuilder/deployment.yaml b/charts/dataplane/templates/imagebuilder/deployment.yaml index cdee1452..45370550 100644 --- a/charts/dataplane/templates/imagebuilder/deployment.yaml +++ b/charts/dataplane/templates/imagebuilder/deployment.yaml @@ -100,9 +100,10 @@ spec: {{- with .Values.imageBuilder.buildkit.additionalVolumes -}} {{ tpl (toYaml .) $ | nindent 6 }} {{- end }} - {{- with .Values.imageBuilder.buildkit.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} + {{- if .Values.imageBuilder.buildkit.nodeSelector }} + {{- include "imagebuilder.buildkit.scheduling.nodeSelector" . | nindent 6 }} + {{- else if .Values.scheduling.nodeSelector }} + {{- include "global.scheduling.nodeSelector" . | nindent 6 }} {{- end }} affinity: podAntiAffinity: @@ -111,8 +112,9 @@ spec: matchLabels: {{- include "imagebuilder.buildkit.selectorLabels" . | nindent 16 }} topologyKey: "kubernetes.io/hostname" - {{- with .Values.imageBuilder.buildkit.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} + {{- if .Values.imageBuilder.buildkit.tolerations }} + {{- include "imagebuilder.buildkit.scheduling.tolerations" . | nindent 6 }} + {{- else if .Values.scheduling.tolerations }} + {{- include "global.scheduling.tolerations" . | nindent 6 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/dataplane/templates/prometheus/deployment.yaml b/charts/dataplane/templates/prometheus/deployment.yaml index ab7459e1..5ce213fd 100644 --- a/charts/dataplane/templates/prometheus/deployment.yaml +++ b/charts/dataplane/templates/prometheus/deployment.yaml @@ -55,15 +55,4 @@ spec: - name: prometheus-config configMap: name: {{ include "union-operator.fullname" . }}-prometheus - {{- with .Values.prometheus.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.prometheus.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.prometheus.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} + {{- include "prometheus.scheduling" . | nindent 6 }} diff --git a/charts/dataplane/values.yaml b/charts/dataplane/values.yaml index 97dc1707..c1829e74 100644 --- a/charts/dataplane/values.yaml +++ b/charts/dataplane/values.yaml @@ -769,6 +769,12 @@ opencost: limits: cpu: 1000m memory: 4Gi + # -- Tolerations for opencost pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for opencost pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} + # -- Affinity rules for opencost pods. + affinity: {} # -- Configuration for fluentbit used for the persistent logging feature. # FluentBit runs as a DaemonSet and ships container logs to the persisted-logs/ @@ -1164,6 +1170,10 @@ image: metrics-server: enabled: false + # -- Tolerations for metrics-server pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for metrics-server pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # -- nodeobserver contains the configuration information for the node observer service. nodeobserver: @@ -1334,7 +1344,11 @@ prometheus: # -- Standalone kube-state-metrics for Union features (cost tracking, pod resource metrics). # Metric filtering is handled in the Prometheus static scrape config. 
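+# An illustrative override (the node label and taint key below are
+# hypothetical placeholders, not chart defaults): on clusters whose
+# dataplane nodes are tainted, the scheduling fields introduced below
+# would typically mirror the global scheduling block, e.g.
+#   kube-state-metrics:
+#     nodeSelector:
+#       node-pool: union-dataplane
+#     tolerations:
+#       - key: union.ai/dedicated
+#         operator: Exists
+#         effect: NoSchedule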
-kube-state-metrics: {} +kube-state-metrics: + # -- Tolerations for kube-state-metrics pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for kube-state-metrics pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # -- Scopes the deployment, permissions and actions created into a single namespace low_privilege: false @@ -1704,6 +1718,10 @@ monitoring: prometheusOperator: enabled: true + # -- Tolerations for prometheus-operator pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for prometheus-operator pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # CRDs should be installed separately via the dataplane-crds chart # (set crds.prometheusOperator: true) before enabling the monitoring stack. @@ -1727,6 +1745,11 @@ monitoring: # Should override for production deployments adminPassword: admin + # -- Tolerations for grafana pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for grafana pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} + # Default monitoring stack for all relevant K8s components that impact # Union performance and reliability. coreDns: @@ -1753,6 +1776,10 @@ monitoring: kube-state-metrics: nameOverride: "monitoring-kube-state-metrics" fullnameOverride: "monitoring-kube-state-metrics" + # -- Tolerations for monitoring kube-state-metrics pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for monitoring kube-state-metrics pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # By default, install a separate Prometheus instance for monitoring. # This is the simplest, out of the box model, it is highly recommended that users look @@ -1782,3 +1809,8 @@ monitoring: requests: cpu: "500m" memory: "1Gi" + + # -- Tolerations for monitoring prometheus pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for monitoring prometheus pods. Set to match scheduling.nodeSelector when using dedicated node pools. 
+ nodeSelector: {} diff --git a/tests/generated/dataplane.global-scheduling.yaml b/tests/generated/dataplane.global-scheduling.yaml new file mode 100644 index 00000000..164175a8 --- /dev/null +++ b/tests/generated/dataplane.global-scheduling.yaml @@ -0,0 +1,6401 @@ +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-production +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-production +--- +# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluentbit-system + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +automountServiceAccountToken: true +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/imagebuilder/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-imagebuilder +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: executor + namespace: union + labels: + app: executor +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- 
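+# Note (illustrative comment for readers of this fixture; the label and
+# taint key are hypothetical, and the exact test values are not shown in
+# this patch): a golden file like this would be rendered with chart-level
+# defaults along the lines of
+#   scheduling:
+#     nodeSelector:
+#       node-pool: union-dataplane
+#     tolerations:
+#       - key: union.ai/dedicated
+#         operator: Exists
+#         effect: NoSchedule
+# so that every component without a component-level override inherits
+# these via the global.scheduling.* helpers.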
+# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/prometheus/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/common/auth-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: union-secret-auth + namespace: union +type: Opaque +data: + # TODO(rob): update or configure operator to use client_secret like all the other components. + app_secret: dGVzdC1zZWNyZXQ= + client_secret: dGVzdC1zZWNyZXQ= +--- +# Source: dataplane/templates/common/cluster-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: operator-cluster-name +type: Opaque +data: + cluster_name: dW5pb24tdGVzdA== +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create an empty secret that the first propeller pod will populate +apiVersion: v1 +kind: Secret +metadata: + name: flyte-pod-webhook + namespace: union +type: Opaque +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresourcesync-config + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + cluster_resources.yaml: | + cluster_resources: + clusterName: 'union-test' + customData: + - production: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - staging: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - development: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + refreshInterval: 5m + standaloneDeployment: true + templatePath: /etc/flyte/clusterresource/templates + clusterResourcesPrivate: + app: + isServerless: false + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + connection: + host: dns:///union.test.union.ai + admin.yaml: | + 
admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + event: + capacity: 1000 + rate: 500 + type: admin + domain.yaml: | + domains: + - id: development + name: development + - id: staging + name: staging + - id: production + name: production + clusters.yaml: | + clusters: + clusterConfigs: [] + labelClusterMap: {} + logger.yaml: | + logger: + level: 4 + show-source: true +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresource-template + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + a_namespace.yaml: | + apiVersion: v1 + kind: Namespace + metadata: + name: {{ namespace }} + labels: + union.ai/namespace-type: flyte + spec: + finalizers: + - kubernetes + + b_default_service_account.yaml: | + apiVersion: v1 + kind: ServiceAccount + metadata: + name: default + namespace: {{ namespace }} + annotations: + {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + + c_project_resource_quota.yaml: | + apiVersion: v1 + kind: ResourceQuota + metadata: + name: project-quota + namespace: {{ namespace }} + spec: + hard: + limits.cpu: {{ projectQuotaCpu }} + limits.memory: {{ projectQuotaMemory }} + requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} +--- +# Source: dataplane/templates/fluent-bit/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentbit-system + namespace: union + labels: + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +data: + custom_parsers.conf: | + [PARSER] + Name docker_no_time + Format json + Time_Keep Off + Time_Key time + Time_Format %Y-%m-%dT%H:%M:%S.%L + fluent-bit.conf: | + [SERVICE] + Parsers_File /fluent-bit/etc/parsers.conf + Parsers_File /fluent-bit/etc/conf/custom_parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + Health_Check On + [INPUT] + Name tail + Tag namespace-.pod-.cont- + Tag_Regex (?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)- + Path /var/log/containers/*.log + DB /var/log/flb_kube.db + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + + + [OUTPUT] + Name s3 + Match * + upload_timeout 1m + s3_key_format /persisted-logs/$TAG + static_file_path true + json_date_key false + region us-east-1 + bucket test-bucket +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name : union-operator-buildkit +data: + buildkitd.toml: | + debug = false + + [log] + format = "text" + + [worker.oci] + enabled = true + snapshotter = "auto" + gc = true + max-parallelism = 0 + + # Should not be used if Policies are defined + gckeepstorage = "10%" + [[worker.oci.gcpolicy]] + # Remove COPY/ADD and git checkout files + keepBytes = "10%" + keepDuration = "24h" + filters = [ "type==source.local", "type==source.git.checkout" ] + [[worker.oci.gcpolicy]] + # Remove locally cached image layers after it's unused for 24 hours + keepBytes = "10%" + keepDuration = "24h" + filters = [ "regular" ] + [[worker.oci.gcpolicy]] + # Remove shared cache mounts. E.G. 
Pip cache + keepBytes = "10%" + keepDuration = "72h" + filters = [ "type==exec.cachemount" ] + [[worker.oci.gcpolicy]] + # Remove everything else to keep the cache size under total file system limit + all = true + keepBytes = "80%" +--- +# Source: dataplane/templates/monitoring/dashboard-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-dashboard-union-dataplane-overview + namespace: union + labels: + grafana_dashboard: "1" + app.kubernetes.io/managed-by: Helm +data: + union-dataplane-overview.json: |- + { + "annotations": { + "list": [] + }, + "description": "Union Dataplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Active Workflows", + "type": "stat", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + } + ], + "description": "Current active FlyteWorkflow CRD count managed by Propeller." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_node_executions{namespace=\"$namespace\"})", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_task_executions{namespace=\"$namespace\"})", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Queue Depth", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", + "legendFormat": "Main", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", + "legendFormat": "Sub", + "refId": "B" + } + ], + "description": "Main and sub workqueue depth over time." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(executor:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics in DP services. Any non-zero value indicates a service caught a panic during request handling." + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all DP deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. Requires monitoring.slos.enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Execution Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:execution_success_rate or (union:dp:slo:propeller_success_rate + union:dp:slo:executor_success_rate) / 2 or union:dp:slo:propeller_success_rate or vector(1)", + "refId": "A" + } + ], + "description": "Combined V1 (propeller) and V2 (executor) task success rate. Falls back to propeller-only or 100% when idle." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Propeller Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", + "refId": "A" + } + ], + "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "DP service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 200, + "title": "Union Operator", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 201, + "title": "Work Queue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:work_queue:operations_processed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(union_operator:work_queue:operations_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "description": "Operator execution operation processing rate and failure rate." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 202, + "title": "Background Process Runs / Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:heartbeat_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:heartbeat_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:status_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status runs", + "refId": "C" + }, + { + "expr": "rate(union_operator:status_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status errors", + "refId": "D" + }, + { + "expr": "rate(union_operator:prometheus_health_checker:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Prom health errors", + "refId": "E" + } + ], + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 203, + "title": "Heartbeat Latency", + "type": "timeseries", + "targets": [ + { + "expr": "union_operator:heartbeat:compute_capabilities_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Capabilities p90", + "refId": "A" + }, + { + "expr": "union_operator:heartbeat:compute_usages_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Usages p90", + "refId": "B" + }, + { + "expr": "union_operator:heartbeat:list_workflows_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "List WFs p90", + "refId": "C" + } + ], + "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 204, + "title": "Config Syncer", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:config_syncer:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:config_syncer:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:config_syncer:propeller_configmap_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller CM updated", + "refId": "C" + } + ], + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 205, + "title": "Billable Usage Collector", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:billable_usage_collector:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:billable_usage_collector:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 206, + "title": "Work Queue Paused", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "union_operator:work_queue:paused{namespace=\"$namespace\"}", + "legendFormat": "Paused", + "refId": "A" + } + ], + "description": "1 when operator paused due to resource limits (FlyteWorkflow count or storage exceeded)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 300, + "title": "Executor (V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 301, + "title": "Active Actions & Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "legendFormat": "Active actions", + "refId": "A" + }, + { + "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "legendFormat": "Available capacity", + "refId": "B" + } + ], + "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 302, + "title": "Cache Discovery", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Miss", + "refId": "A" + }, + { + "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Put success", + "refId": "B" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 303, + "title": "Actions Terminated by Phase", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ phase }}", + "refId": "A" + } + ], + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 304, + "title": "Evaluator Duration (pod creation)", + "type": "timeseries", + "targets": [ + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Evaluate p50", + "refId": "A" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Evaluate p90", + "refId": "B" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "Evaluate p99", + "refId": "C" + } + ], + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 305, + "title": "System Failures & Invalid Leases", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "System failures", + "refId": "A" + }, + { + "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Exhausted retries", + "refId": "B" + }, + { + "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Invalid leases", + "refId": "C" + }, + { + "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Evaluate errors", + "refId": "D" + } + ], + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 100, + "title": "Flyte Propeller (V1)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 101, + "title": "Round Time (p50 / p90 / p99)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 102, + "title": "Round Success / Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Success", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Panics", + "refId": "C" + } + ], + "description": "Propeller round outcomes: success, errors, and panics per second." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 103, + "title": "Free Workers", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", + "legendFormat": "Free workers", + "refId": "A" + } + ], + "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 104, + "title": "Queue Add Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main adds", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sub adds", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main retries", + "refId": "C" + } + ], + "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 105, + "title": "Workflow Updates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Updated", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Too large", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Conflict", + "refId": "D" + } + ], + "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 106, + "title": "Workflow Update Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "etcd write latency for FlyteWorkflow status updates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 107, + "title": "Node Queueing & Execution Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Queue p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Queue p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", + "legendFormat": "Exec p90 (ms)", + "refId": "C" + } + ], + "description": "Node lifecycle: queueing latency (queued\u2192running) and execution latency (time in handler)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 108, + "title": "Metastore Cache Hit Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Hit rate", + "refId": "A" + } + ], + "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 109, + "title": "Event Recording (DP \u2192 CP)", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task success", + "refId": "A" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node success", + "refId": "B" + }, + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task failure", + "refId": "C" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node failure", + "refId": "D" + } + ], + "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 110, + "title": "Cache Discovery (hit/miss/skip)", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Hits", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Misses", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Skips", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Get failures", + "refId": "D" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 111, + "title": "K8s API Client Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "K8s requests/s", + "refId": "A" + } + ], + "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 112, + "title": "K8s API Client Latency (p90)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Request p90", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Rate limiter p90", + "refId": "B" + } + ], + "description": "K8s API request latency and client-side rate limiter wait time at p90." 
+ } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 401, + "title": "gRPC Client Request Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 402, + "title": "gRPC Client Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 403, + "title": "gRPC Client Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "dataplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "union", + "value": "union" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "union", + "value": "union" + } + ], + "query": "union", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Dataplane Overview", + "uid": "union-dp-overview", + "version": 1 + } +--- +# Source: dataplane/templates/nodeexecutor/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: executor + namespace: union + labels: + app: executor +data: + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + linkType: ide + templateUris: + - /dataplane/pod/v1/generated_name/6060/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/union-test/{{.namespace}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/{{.generatedName}}/ + - wandb-execution-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .podName }}' + - wandb-custom-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .taskConfig.id }}' + - comet-ml-execution-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .executionName }}{{ .nodeId }}{{ + .taskRetryAttempt }}{{ .taskConfig.link_suffix }}' + - 
comet-ml-custom-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .taskConfig.experiment_key }}' + - neptune-scale-run: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .podName }} + - neptune-scale-custom-id: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .taskConfig.id }} + kubernetes-enabled: true + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + config.yaml: | + executor: + cluster: 'union-test' + evaluatorCount: 64 + maxActions: 2000 + organization: 'union' + unionAuth: + injectSecret: true + secretName: EAGER_API_KEY + workerName: worker1 + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 4096 + gpu: 256 + memory: 2Ti + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + authorizer: + type: noop + catalog-cache: + cache-endpoint: dns:///union.test.union.ai + endpoint: dns:///union.test.union.ai + insecure: false + type: fallback + use-admin-auth: true + logger: + level: 4 + show-source: true + sharedService: + metrics: + scope: 'executor:' + security: + allowCors: true + allowLocalhostAccess: true + allowedHeaders: + - Content-Type + allowedOrigins: + - '*' + secure: false + useAuth: false + propeller: + node-config: + disable-input-file-writes: true + plugins: + fasttask: + additional-worker-args: + - --last-ack-grace-period-seconds + - "120" + callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + grace-period-status-not-found: 2m + ioutils: + remoteFileOutputPaths: + deckFilename: report.html + k8s: + disable-inject-owner-references: true + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/templates/operator/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + config.yaml: | + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: 
/etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + sharedService: + features: + gatewayV2: true + port: 8081 + authorizer: + type: noop + operator: + enabled: true + enableTunnelService: true + tunnel: + enableDirectToAppIngress: false + deploymentToRestart: union-operator-proxy + apps: + enabled: 'false' + syncClusterConfig: + enabled: false + clusterId: + organization: 'union' + clusterData: + appId: 'test-client' + bucketName: 'test-bucket' + bucketRegion: 'us-east-1' + cloudHostName: 'union.test.union.ai' + gcpProjectId: '' + metadataBucketPrefix: 's3://test-bucket' + userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + userRoleKey: 'eks.amazonaws.com/role-arn' + collectUsages: + enabled: true + billing: + model: Legacy + dependenciesHeartbeat: + prometheus: + endpoint: 'http://union-operator-prometheus:80/-/healthy' + propeller: + endpoint: 'http://flytepropeller:10254' + proxy: + endpoint: 'http://union-operator-proxy:10254' + imageBuilder: + enabled: true + executionNamespaceLabels: + union.ai/namespace-type: flyte + referenceConfigmapName: union-operator + targetConfigMapName: "build-image-config" + proxy: + imageBuilderConfig: + authenticationType: 'noop' + defaultRepository: '' + persistedLogs: + objectStore: + pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} + prefix: persisted-logs + sourceType: ObjectStore + smConfig: + enabled: 'true' + k8sConfig: + namespace: 'union' + type: 'K8s' + logger.yaml: | + logger: + level: 4 + show-source: true + config-overrides.yaml: | + cache: + identity: + enabled: false + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + fast_registration_storage.yaml: | + fastRegistrationStorage: + container: "" + type: s3 + connection: + auth-type: iam + region: us-east-1 + image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + image-builder.default-repository: "" + image-builder.authentication-type: "noop" +--- +# Source: dataplane/templates/prometheus/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + alerting: + alertmanagers: + - static_configs: + - targets: + rule_files: + - rules.yml + scrape_configs: + # Self-monitoring + - job_name: prometheus + metrics_path: /prometheus/metrics + static_configs: + - targets: ['localhost:9090'] + metric_relabel_configs: + - source_labels: [__name__] + regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total + action: keep + + # Kube state metrics for pod/node resource tracking and cost calculations + - job_name: kube-state-metrics + static_configs: + - targets: ['release-name-kube-state-metrics:8080'] + 
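+      # Cardinality control (note): every scrape job in this file allowlists the
+      # metric names it needs via `action: keep` relabeling, so only series that
+      # the dashboards and recording rules actually consume are ingested.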
metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total + action: keep + - separator: ; + source_labels: [__name__, phase] + regex: kube_pod_status_phase;(Succeeded|Failed) + action: drop + - source_labels: [node] + target_label: nodename + regex: '(.*)' + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: (.+) + target_label: label_node_pool_name + + # cAdvisor container metrics for CPU and memory tracking + - job_name: kubernetes-cadvisor + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes + action: keep + relabel_configs: + - separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + action: replace + + # Flyte propeller metrics for execution info and fast task duration + - job_name: flytepropeller + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - union + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # OpenCost metrics for cost tracking + - job_name: opencost + static_configs: + - targets: ['release-name-opencost:9003'] + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + rules.yml: | + + groups: + - name: cost_calculations_15s + interval: 15s + rules: + - record: pod_gpu_allocation + expr: | + sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) + - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
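+        # Join mechanics (note): the nested label_replace() calls below only copy
+        # existing labels into label_*-prefixed copies so that series keyed by
+        # label_entity_id can look up execution metadata. Illustrative query,
+        # not one of the recorded rules:
+        #   entity_id:total_cost:sum
+        #     * on (label_entity_id) group_left(label_entity_name) execution_info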
+ expr: | + max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ) + ) + - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( + label_replace( + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps + "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup + ), + "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key + ) + ) + - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_workspace_name, label_entity_id)( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces + "label_entity_id", "$1", "label_node_id", "(.*)" # join key + ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels + ) + ) + - record: fast_task_execution_duration + expr: | + max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ), + "namespace", "$1", "exported_namespace", "(.*)" + ), + "pod", "$1", "exported_pod", "(.*)" + ) + ) + - record: fast_task_execution_duration_rate + expr: | + irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration + - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
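+        # Allocation model (note): per pod, charge max(requested, consumed).
+        # Illustrative numbers: a pod requesting 4GiB whose working set grows to
+        # 6GiB is charged for 6GiB; a pod requesting 4GiB but using 1GiB is still
+        # charged for the full 4GiB request, since that capacity is reserved.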
+ expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity + # First, calculate the allocated memory for each pod + max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory + ( + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} + ) + ) + or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} 
== 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, calculate the allocated cpu for each pod + max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu + ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task 
labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + 
fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity + # First, calculate the used memory for each pod + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", 
label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:mem_usage_bytes_total_per_node:sum + ) + - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", 
"fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:cpu_usage_per_node:sum + ) + - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity + expr: | + avg by (label_entity_type, label_domain, label_project, label_entity_id) ( + # First, grab the SM occupancy for each pod + max by (namespace, pod) ( + DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # 
Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and 
allocated GPU count (something like "used memory", numerator of weighted calcs) + expr: | + entity_id:sm_occupancy:avg + * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum + - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. + expr: | + label_replace( + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:allocated_mem_cost:sum + or + entity_id:allocated_cpu_cost:sum + or + entity_id:allocated_gpu_cost:sum + ), + "type", "allocated", "", "" # add type info + ) + - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) + expr: | + label_replace( + sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) + # Start with each execution's and app's allocated cost per node + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity + / on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be 
different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts + ) + # Then multiply by the overhead cost per node + * on (node) group_left() ( + # To calculate overhead, start with the true cost of running each node + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes + * on (node) max by (node) ( + node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an unlabeled label to show up in the Compute Costs dashboard charts + ) * (15 / 3600) # convert hourly cost to 15-secondly cost + # Then subtract out the total allocated cost on each node + - on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + ) + ) + ), + "type", "overhead", "", "" # add type info + ) + - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) + expr: | + label_replace( + sum by (label_domain, label_project, label_entity_id, label_entity_type) ( + entity_id:allocated_cost:sum + or + entity_id:overhead_cost:sum + ), + "type", "total", "", "" # add type info + ) + - record: node:total_cost:sum # Total cost of all nodes + expr: | + sum ( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost + ) + - record: node_type:total_cost:sum # Total cost of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label + ) + 
- record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node, node_type)( # dedupe + label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel + ) + ) * (15 / 3600) # convert to number of hours per 15-second observation + # The group below aggregates the 15s series above into 15m rollup metrics + - name: cost_rollup_15m + interval: 15m + rules: + - record: execution_info15m + expr: | + max_over_time(execution_info[15m:15s]) + - record: app_info15m + expr: | + max_over_time(app_info[15m:15s]) + - record: workspace_info15m + expr: | + max_over_time(workspace_info[15m:15s]) + - record: entity_id:allocated_mem_bytes:sum15m + expr: | + sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) + - record: entity_id:used_mem_bytes:sum15m + expr: | + sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) + - record: entity_id:allocated_cpu:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) + - record: entity_id:used_cpu:sum15m + expr: | + sum_over_time(entity_id:used_cpu:sum[15m:15s]) + - record: entity_id:weighted_sm_occupancy:sum15m + expr: | + sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) + - record: entity_id:gpu_count:sum15m + expr: | + sum_over_time(entity_id:gpu_count:sum[15m:15s]) + - record: entity_id:allocated_mem_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) + - record: entity_id:allocated_cpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) + - record: entity_id:allocated_gpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) + - record: entity_id:allocated_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cost:sum[15m:15s]) + - record: entity_id:overhead_cost:sum15m + expr: | + sum_over_time(entity_id:overhead_cost:sum[15m:15s]) + - record: entity_id:total_cost:sum15m + expr: | + sum_over_time(entity_id:total_cost:sum[15m:15s]) + - record: node:total_cost:sum15m + expr: | + sum_over_time(node:total_cost:sum[15m:15s]) + - record: node_type:total_cost:sum15m + expr: | + sum_over_time(node_type:total_cost:sum[15m:15s]) + - record: node_type:uptime_hours:sum15m + expr: | + sum_over_time(node_type:uptime_hours:sum[15m:15s]) +--- +# Source: dataplane/templates/propeller/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-propeller-config + namespace: union +data: + admin.yaml: | + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + event: + capacity: 1000 + rate: 500 + type: admin + catalog.yaml: | + catalog-cache: + cache-endpoint: dns:///union.test.union.ai + endpoint: dns:///union.test.union.ai + insecure: false + type: fallback + use-admin-auth: true + copilot.yaml: | + plugins: + k8s: + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + core.yaml: | + propeller: + downstream-eval-duration: 30s + enable-admin-launcher: true + leader-election: + enabled: true + lease-duration: 15s + lock-config-map: + name: propeller-leader + namespace: 'union' + renew-deadline: 10s + retry-period: 2s + limit-namespace: all + literal-offloading-config: + enabled: true + max-workflow-retries: 30 + metadata-prefix: metadata/propeller + metrics-prefix: flyte + prof-port: 10254 + queue: + batch-size: -1 + batching-interval: 2s + queue: 
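+          # Workqueue tuning (summary): the outer "batch" queue drains ready items
+          # every batching-interval (batch-size -1 = no cap); "maxof" follows
+          # client-go's composite rate limiter, taking the larger of the
+          # exponential backoff delay (base-delay up to max-delay) and the
+          # token-bucket delay, with the bucket-limited sub-queue below.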
+ base-delay: 5s + capacity: 1000 + max-delay: 120s + rate: 100 + type: maxof + sub-queue: + capacity: 100 + rate: 10 + type: bucket + type: batch + rawoutput-prefix: 's3://test-bucket' + workers: 4 + workflow-reeval-duration: 30s + webhook: + certDir: /etc/webhook/certs + embeddedSecretManagerConfig: + imagePullSecrets: + enabled: true + k8sConfig: + namespace: 'union' + type: 'K8s' + listenPort: '9443' + secretManagerTypes: + - Embedded + - K8s + serviceName: flyte-pod-webhook + servicePort: '443' + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + logger.yaml: | + logger: + level: 4 + show-source: true + resource_manager.yaml: | + propeller: + resourcemanager: + type: noop + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + templateUris: + - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ + kubernetes-enabled: false + templates: + - displayName: Task Logs + scheme: TaskExecution + templateUris: + - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + 
verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +# Source: dataplane/charts/opencost/templates/clusterrole.yaml +# Cluster role giving opencost to get, list, watch required resources +# No write permissions are required +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] + resources: + - configmaps + - deployments + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - get + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - deployments + - daemonsets + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - get + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - get + - list + - watch + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-clustersync-resource +rules: + - apiGroups: + - "" + - rbac.authorization.k8s.io + resources: + - configmaps + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - clusterrolebindings + - podtemplates + verbs: + - '*' +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - events + - flyteworkflows + - pods/log + - pods + - rayjobs + - resourcequotas + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - nonResourceURLs: + - /metrics + verbs: + - get +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - pods + - endpoints + - services + verbs: + - get + - list + - watch + - nonResourceURLs: + - /metrics + - /metrics/cadvisor + verbs: + - get +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch +--- +# Source: 
dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: flytepropeller-role +rules: + # Allow RO access to PODS + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + # Allow Event recording access + - apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch + # Allow Access All plugin objects + - apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + # Allow Access to CRD + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: fluentbit-system + namespace: union +--- +# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-kube-state-metrics +subjects: +- kind: ServiceAccount + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-opencost +subjects: + - kind: ServiceAccount + name: release-name-opencost + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-resource +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-clustersync-resource +subjects: + - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: 
+ - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-executor +subjects: +- kind: ServiceAccount + name: executor + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: proxy-system +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-operator-prometheus +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Create a binding from Role -> ServiceAccount +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-webhook-role +subjects: + - kind: ServiceAccount + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: flytepropeller-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-role +subjects: + - kind: ServiceAccount + name: flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: operator-system + labels: + 
app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + - deployments + verbs: + - get + - list + - watch + - create + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system-secret +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/charts/fluentbit/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 2020 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + annotations: + prometheus.io/scrape: 'true' +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/opencost/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + selector: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + type: "ClusterIP" + ports: + - name: http + port: 9003 + targetPort: 9003 +--- +# Source: dataplane/templates/clusterresourcesync/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: syncresources + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm 
+spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/imagebuilder/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 1234 + targetPort: tcp + protocol: TCP + name: tcp + selector: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/nodeexecutor/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-dataplane-executor + labels: + platform.union.ai/prometheus-group: "union-services" + app: executor +spec: + type: ClusterIP + ports: + - port: 15605 + targetPort: 15605 + protocol: TCP + name: fasttask + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app: executor +--- +# Source: dataplane/templates/operator/service-proxy.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-proxy + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/operator/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/prometheus/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 9090 + protocol: TCP + name: http + selector: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name +--- +# Source: 
dataplane/templates/propeller/service-webhook.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + projectcontour.io/upstream-protocol.h2c: grpc +spec: + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: https + protocol: TCP + port: 443 + targetPort: 9443 + - name: debug + protocol: TCP + port: 10254 + targetPort: 10254 +--- +# Source: dataplane/templates/propeller/service-webhook.yaml +# Headless Service for cache invalidation — resolves to all pod IPs so that +# we can fan out invalidation requests to every webhook replica. +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook-headless + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: cache-internal + protocol: TCP + port: 9443 + targetPort: 9443 +--- +# Source: dataplane/templates/propeller/service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: union + name: flytepropeller + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: debug + protocol: TCP + port: 10254 + - name: fasttask + port: 15605 + protocol: TCP + targetPort: 15605 + selector: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/fluentbit/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + annotations: + checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + spec: + serviceAccountName: fluentbit-system + hostNetwork: false + dnsPolicy: ClusterFirst + containers: + - name: fluentbit + image: "cr.fluentbit.io/fluent/fluent-bit:3.2.8" + imagePullPolicy: IfNotPresent + command: + - /fluent-bit/bin/fluent-bit + args: + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + ports: + - name: http + containerPort: 2020 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: /api/v1/health + port: http + volumeMounts: + - name: config + mountPath: /fluent-bit/etc/conf + - mountPath: /var/log + name: varlog + - mountPath: /var/lib/docker/containers + name: varlibdockercontainers + readOnly: true + - mountPath: /etc/machine-id + name: etcmachineid + readOnly: true + volumes: + - name: config + configMap: + name: fluentbit-system + - hostPath: + path: /var/log + name: varlog + - hostPath: + path: 
/var/lib/docker/containers + name: varlibdockercontainers + - hostPath: + path: /etc/machine-id + type: File + name: etcmachineid + tolerations: + - operator: Exists +--- +# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: release-name-kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: kube-state-metrics + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + ports: + - containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: dataplane/charts/opencost/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: release-name-opencost + containers: + - name: release-name-opencost + image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 + 
imagePullPolicy: IfNotPresent + args: + ports: + - containerPort: 9003 + name: http + resources: + limits: + cpu: 1000m + memory: 4Gi + requests: + cpu: 500m + memory: 1Gi + startupProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 30 + livenessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 20 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + env: + - name: LOG_LEVEL + value: info + - name: CUSTOM_COST_ENABLED + value: "false" + - name: KUBECOST_NAMESPACE + value: union + - name: API_PORT + value: "9003" + - name: PROMETHEUS_SERVER_ENDPOINT + value: "http://union-operator-prometheus.union.svc:80/prometheus" + - name: CLUSTER_ID + value: "default-cluster" + - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS + value: "15" + - name: CLOUD_COST_ENABLED + value: "false" + - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL + value: "6" + - name: CLOUD_COST_REFRESH_RATE_HOURS + value: "6" + - name: CLOUD_COST_QUERY_WINDOW_DAYS + value: "7" + - name: CLOUD_COST_RUN_WINDOW_DAYS + value: "3" + # Add any additional provided variables +--- +# Source: dataplane/templates/clusterresourcesync/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-syncresources + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "b511750d960c272bb6a4f3ddbbfd46cfcaf0f7dfa7c3e4348c14af517722b00" + + labels: + + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - clusterresource + - --config + - /etc/flyte/config/*.yaml + - clusterresource + - run + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: sync-cluster-resources + resources: + limits: + cpu: "1" + memory: 500Mi + requests: + cpu: 500m + memory: 100Mi + volumeMounts: + - name: auth + mountPath: /etc/union/secret + - name: resource-templates + mountPath: /etc/flyte/clusterresource/templates + - name: config-volume + mountPath: /etc/flyte/config + ports: + - name: debug + containerPort: 10254 + protocol: TCP + serviceAccountName: union-clustersync-system + volumes: + - configMap: + name: union-clusterresource-template + name: resource-templates + - configMap: + name: union-clusterresourcesync-config + name: config-volume + 
- name: auth + secret: + secretName: union-secret-auth + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - flyte + - serve + - connector + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" + imagePullPolicy: "IfNotPresent" + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "10" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/imagebuilder/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + container.apparmor.security.beta.kubernetes.io/buildkit: unconfined + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: "union-imagebuilder" + containers: + - name: "buildkit" + image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - 
name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - mountPath: /home/user/.local/share/buildkit + name: buildkitd + - mountPath: /etc/buildkit + name: buildkit-config + args: + - --config + - /etc/buildkit/buildkitd.toml + - --addr + - unix:///run/user/1000/buildkit/buildkitd.sock + - --addr + - tcp://0.0.0.0:1234 + - --oci-worker-no-process-sandbox + ports: + - name: tcp + containerPort: 1234 + protocol: TCP + readinessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + livenessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + securityContext: + seccompProfile: # Needs Kubernetes >= 1.19 + type: Unconfined + runAsUser: 1000 + runAsGroup: 1000 + resources: + requests: + cpu: 1 + ephemeral-storage: 20Gi + memory: 1Gi + volumes: + - name: buildkitd + emptyDir: {} + - configMap: + name: union-operator-buildkit + name: buildkit-config + + nodeSelector: + app_pool: flyte + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/nodeexecutor/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executor + namespace: union + labels: + app: executor +spec: + replicas: 1 + selector: + matchLabels: + app: executor + template: + metadata: + annotations: + configChecksum: "3d931e5636192b94c904aa780a60effc2bb71861f72f22b448e711b33d41918" + + labels: + + app: executor + spec: + securityContext: + fsGroup: 1337 + serviceAccountName: executor + volumes: + - name: config-volume + configMap: + name: executor + - name: secret-volume + secret: + secretName: union-secret-auth + - name: auth + secret: + secretName: union-secret-auth + containers: + - name: executor + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + command: + - executor + - serve + - --config + - /etc/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + resources: + limits: + cpu: "4" + memory: "8Gi" + requests: + cpu: "1" + memory: "1Gi" + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: secret-volume + mountPath: /etc/union/secret + - name: auth + mountPath: /etc/secrets/ + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + 
operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment-proxy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-proxy + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + volumes: + - name: config-volume + projected: + sources: + - configMap: + name: union-operator + - configMap: + name: union-clusterresourcesync-config + - name: secret-volume + secret: + secretName: union-secret-auth + serviceAccountName: proxy-system + securityContext: + {} + containers: + - name: operator-proxy + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + args: + - operator + - proxy + - --config + - /etc/union/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: connect + containerPort: 8080 + protocol: TCP + - name: grpc + containerPort: 8081 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + - name: "tunnel" + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + args: + - cloudflared + - tunnel + - --no-autoupdate + - run + - --token + - $(TUNNEL_TOKEN) + env: + - name: TUNNEL_TOKEN + valueFrom: + secretKeyRef: + name: union-secret-auth + key: tunnel_token + optional: true + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + 
app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + serviceAccountName: operator-system + securityContext: + {} + volumes: + - name: config-volume + configMap: + name: union-operator + - name: secret-volume + secret: + secretName: union-secret-auth + containers: + - name: operator + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + args: + - operator + - serve + - --config + - /etc/union/config/*.yaml + - --operator.clusterId.name + - "$(CLUSTER_NAME)" + - --operator.tunnel.k8sSecretName + - union-secret-auth + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/prometheus/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "9505483b28e45abfefda9a9791a7719382b61225386ddfbdfea71a459a1423e" + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + spec: + priorityClassName: system-cluster-critical + serviceAccountName: union-operator-prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: prometheus 
+ image: "prom/prometheus:v3.3.1" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --web.external-url=/prometheus/ + - --web.route-prefix=/prometheus/ + - --storage.tsdb.retention.time=3d + ports: + - name: http + containerPort: 9090 + protocol: TCP + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + volumes: + - name: prometheus-config + configMap: + name: union-operator-prometheus + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create the actual deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flytepropeller-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + serviceAccountName: flytepropeller-webhook-system + initContainers: + - name: generate-secrets + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - init-certs + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + containers: + - name: webhook + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: 
limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + ports: + - containerPort: 9443 + - containerPort: 10254 + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + readOnly: true + - name: webhook-certs + mountPath: /etc/webhook/certs + readOnly: true + volumes: + - name: config-volume + configMap: + name: flyte-propeller-config + - name: webhook-certs + secret: + secretName: flyte-pod-webhook + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: union + name: flytepropeller + labels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + labels: + + + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + priorityClassName: system-cluster-critical + containers: + - command: + - flytepropeller + - --config + - /etc/flyte/config/*.yaml + - --propeller.cluster-id + - union-test + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: flytepropeller + ports: + - containerPort: 10254 + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + - name: auth + mountPath: /etc/union/secret + serviceAccountName: flytepropeller-system + volumes: + - configMap: + name: flyte-propeller-config + name: config-volume + - name: auth + secret: + secretName: union-secret-auth + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + 
nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "release-name-fluentbit-test-connection" + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm + annotations: + helm.sh/hook: test + helm.sh/hook-delete-policy: hook-succeeded +spec: + containers: + - name: wget + image: "busybox:latest" + imagePullPolicy: Always + command: ["sh"] + args: ["-c", "sleep 5s && wget -O- release-name-fluentbit:2020"] + restartPolicy: Never diff --git a/tests/generated/dataplane.oci.yaml b/tests/generated/dataplane.oci.yaml index 6c239916..d21fa8ac 100644 --- a/tests/generated/dataplane.oci.yaml +++ b/tests/generated/dataplane.oci.yaml @@ -5490,6 +5490,9 @@ spec: - configMap: name: union-operator-buildkit name: buildkit-config + + nodeSelector: + flyte.org/node-role: worker affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -5498,6 +5501,12 @@ spec: app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoSchedule + key: flyte.org/node-role + operator: Equal + value: worker --- # Source: dataplane/templates/nodeexecutor/deployment.yaml apiVersion: apps/v1 @@ -5951,6 +5960,14 @@ spec: - name: prometheus-config configMap: name: union-operator-prometheus + + nodeSelector: + flyte.org/node-role: worker + tolerations: + - effect: NoSchedule + key: flyte.org/node-role + operator: Equal + value: worker --- # Source: dataplane/templates/propeller/deployment-webhook.yaml # Create the actual deployment diff --git a/tests/generated/dataplane.scheduling-override.yaml b/tests/generated/dataplane.scheduling-override.yaml new file mode 100644 index 00000000..91377e51 --- /dev/null +++ b/tests/generated/dataplane.scheduling-override.yaml @@ -0,0 +1,6332 @@ +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-production +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace 
+metadata: + name: union-health-monitoring-production +--- +# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluentbit-system + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +automountServiceAccountToken: true +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/imagebuilder/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-imagebuilder +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: executor + namespace: union + labels: + app: executor +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/prometheus/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: 
flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/common/auth-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: union-secret-auth + namespace: union +type: Opaque +data: + # TODO(rob): update or configure operator to use client_secret like all the other components. + app_secret: dGVzdC1zZWNyZXQ= + client_secret: dGVzdC1zZWNyZXQ= +--- +# Source: dataplane/templates/common/cluster-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: operator-cluster-name +type: Opaque +data: + cluster_name: dW5pb24tdGVzdA== +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create an empty secret that the first propeller pod will populate +apiVersion: v1 +kind: Secret +metadata: + name: flyte-pod-webhook + namespace: union +type: Opaque +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresourcesync-config + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + cluster_resources.yaml: | + cluster_resources: + clusterName: 'union-test' + customData: + - production: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - staging: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - development: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + refreshInterval: 5m + standaloneDeployment: true + templatePath: /etc/flyte/clusterresource/templates + clusterResourcesPrivate: + app: + isServerless: false + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + connection: + host: dns:///union.test.union.ai + admin.yaml: | + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + event: + capacity: 1000 + rate: 500 + type: admin + domain.yaml: | + domains: + - id: development + name: development + - id: staging + name: staging + - id: production + name: production + clusters.yaml: | + clusters: + clusterConfigs: [] + labelClusterMap: {} + logger.yaml: | + logger: + level: 4 + show-source: true +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresource-template + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + a_namespace.yaml: | + apiVersion: v1 + kind: Namespace + metadata: + name: {{ namespace }} + labels: + union.ai/namespace-type: flyte + spec: + finalizers: + - kubernetes + + b_default_service_account.yaml: | + 
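+    # Note: clusterresourcesync substitutes the namespace, defaultUserRoleKey, and defaultUserRoleValue placeholders in this template for every project/domain namespace it manages, using the customData values defined above.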
apiVersion: v1 + kind: ServiceAccount + metadata: + name: default + namespace: {{ namespace }} + annotations: + {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + + c_project_resource_quota.yaml: | + apiVersion: v1 + kind: ResourceQuota + metadata: + name: project-quota + namespace: {{ namespace }} + spec: + hard: + limits.cpu: {{ projectQuotaCpu }} + limits.memory: {{ projectQuotaMemory }} + requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} +--- +# Source: dataplane/templates/fluent-bit/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentbit-system + namespace: union + labels: + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +data: + custom_parsers.conf: | + [PARSER] + Name docker_no_time + Format json + Time_Keep Off + Time_Key time + Time_Format %Y-%m-%dT%H:%M:%S.%L + fluent-bit.conf: | + [SERVICE] + Parsers_File /fluent-bit/etc/parsers.conf + Parsers_File /fluent-bit/etc/conf/custom_parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + Health_Check On + [INPUT] + Name tail + Tag namespace-<namespace_name>.pod-<pod_name>.cont-<container_name> + Tag_Regex (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)- + Path /var/log/containers/*.log + DB /var/log/flb_kube.db + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + + + [OUTPUT] + Name s3 + Match * + upload_timeout 1m + s3_key_format /persisted-logs/$TAG + static_file_path true + json_date_key false + region us-east-1 + bucket test-bucket +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-buildkit +data: + buildkitd.toml: | + debug = false + + [log] + format = "text" + + [worker.oci] + enabled = true + snapshotter = "auto" + gc = true + max-parallelism = 0 + + # Should not be used if Policies are defined + gckeepstorage = "10%" + [[worker.oci.gcpolicy]] + # Remove COPY/ADD and git checkout files + keepBytes = "10%" + keepDuration = "24h" + filters = [ "type==source.local", "type==source.git.checkout" ] + [[worker.oci.gcpolicy]] + # Remove locally cached image layers after they're unused for 24 hours + keepBytes = "10%" + keepDuration = "24h" + filters = [ "regular" ] + [[worker.oci.gcpolicy]] + # Remove shared cache mounts, e.g.
Pip cache + keepBytes = "10%" + keepDuration = "72h" + filters = [ "type==exec.cachemount" ] + [[worker.oci.gcpolicy]] + # Remove everything else to keep the cache size under total file system limit + all = true + keepBytes = "80%" +--- +# Source: dataplane/templates/monitoring/dashboard-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-dashboard-union-dataplane-overview + namespace: union + labels: + grafana_dashboard: "1" + app.kubernetes.io/managed-by: Helm +data: + union-dataplane-overview.json: |- + { + "annotations": { + "list": [] + }, + "description": "Union Dataplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Active Workflows", + "type": "stat", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + } + ], + "description": "Current active FlyteWorkflow CRD count managed by Propeller." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_node_executions{namespace=\"$namespace\"})", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_task_executions{namespace=\"$namespace\"})", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Queue Depth", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", + "legendFormat": "Main", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", + "legendFormat": "Sub", + "refId": "B" + } + ], + "description": "Main and sub workqueue depth over time." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(executor:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics in DP services. Any non-zero value indicates a service caught a panic during request handling." + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all DP deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. Requires monitoring.slos.enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Execution Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:execution_success_rate or (union:dp:slo:propeller_success_rate + union:dp:slo:executor_success_rate) / 2 or union:dp:slo:propeller_success_rate or vector(1)", + "refId": "A" + } + ], + "description": "Combined V1 (propeller) and V2 (executor) task success rate. Falls back to propeller-only or 100% when idle." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Propeller Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", + "refId": "A" + } + ], + "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "DP service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 200, + "title": "Union Operator", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 201, + "title": "Work Queue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:work_queue:operations_processed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(union_operator:work_queue:operations_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "description": "Operator work queue operation processing and failure rates." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 202, + "title": "Background Process Runs / Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:heartbeat_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:heartbeat_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:status_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status runs", + "refId": "C" + }, + { + "expr": "rate(union_operator:status_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status errors", + "refId": "D" + }, + { + "expr": "rate(union_operator:prometheus_health_checker:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Prom health errors", + "refId": "E" + } + ], + "description": "Union Operator background process run and error rates: heartbeat updater, status updater, and Prometheus health checker."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 203, + "title": "Heartbeat Latency", + "type": "timeseries", + "targets": [ + { + "expr": "union_operator:heartbeat:compute_capabilities_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Capabilities p90", + "refId": "A" + }, + { + "expr": "union_operator:heartbeat:compute_usages_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Usages p90", + "refId": "B" + }, + { + "expr": "union_operator:heartbeat:list_workflows_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "List WFs p90", + "refId": "C" + } + ], + "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 204, + "title": "Config Syncer", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:config_syncer:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:config_syncer:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:config_syncer:propeller_configmap_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller CM updated", + "refId": "C" + } + ], + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 205, + "title": "Billable Usage Collector", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:billable_usage_collector:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:billable_usage_collector:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 206, + "title": "Work Queue Paused", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "union_operator:work_queue:paused{namespace=\"$namespace\"}", + "legendFormat": "Paused", + "refId": "A" + } + ], + "description": "1 when operator paused due to resource limits (FlyteWorkflow count or storage exceeded)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 300, + "title": "Executor (V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 301, + "title": "Active Actions & Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "legendFormat": "Active actions", + "refId": "A" + }, + { + "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "legendFormat": "Available capacity", + "refId": "B" + } + ], + "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 302, + "title": "Cache Discovery", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Miss", + "refId": "A" + }, + { + "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Put success", + "refId": "B" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 303, + "title": "Actions Terminated by Phase", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ phase }}", + "refId": "A" + } + ], + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 304, + "title": "Evaluator Duration (pod creation)", + "type": "timeseries", + "targets": [ + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Evaluate p50", + "refId": "A" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Evaluate p90", + "refId": "B" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "Evaluate p99", + "refId": "C" + } + ], + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 305, + "title": "System Failures & Invalid Leases", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "System failures", + "refId": "A" + }, + { + "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Exhausted retries", + "refId": "B" + }, + { + "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Invalid leases", + "refId": "C" + }, + { + "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Evaluate errors", + "refId": "D" + } + ], + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 100, + "title": "Flyte Propeller (V1)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 101, + "title": "Round Time (p50 / p90 / p99)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 102, + "title": "Round Success / Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Success", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Panics", + "refId": "C" + } + ], + "description": "Propeller round outcomes: success, errors, and panics per second." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 103, + "title": "Free Workers", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", + "legendFormat": "Free workers", + "refId": "A" + } + ], + "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 104, + "title": "Queue Add Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main adds", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sub adds", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main retries", + "refId": "C" + } + ], + "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 105, + "title": "Workflow Updates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Updated", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Too large", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Conflict", + "refId": "D" + } + ], + "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 106, + "title": "Workflow Update Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "etcd write latency for FlyteWorkflow status updates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 107, + "title": "Node Queueing & Execution Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Queue p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Queue p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", + "legendFormat": "Exec p90 (ms)", + "refId": "C" + } + ], + "description": "Node lifecycle: queueing latency (queued\u2192running) and execution latency (time in handler)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 108, + "title": "Metastore Cache Hit Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Hit rate", + "refId": "A" + } + ], + "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 109, + "title": "Event Recording (DP \u2192 CP)", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task success", + "refId": "A" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node success", + "refId": "B" + }, + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task failure", + "refId": "C" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node failure", + "refId": "D" + } + ], + "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 110, + "title": "Cache Discovery (hit/miss/skip)", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Hits", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Misses", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Skips", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Get failures", + "refId": "D" + } + ], + "description": "Propeller cache discovery outcomes: hit, miss, skip, and get-failure rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 111, + "title": "K8s API Client Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "K8s requests/s", + "refId": "A" + } + ], + "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 112, + "title": "K8s API Client Latency (p90)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Request p90", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Rate limiter p90", + "refId": "B" + } + ], + "description": "K8s API request latency and client-side rate limiter wait time at p90."
+ } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 401, + "title": "gRPC Client Request Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 402, + "title": "gRPC Client Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 403, + "title": "gRPC Client Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "dataplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "union", + "value": "union" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "union", + "value": "union" + } + ], + "query": "union", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Dataplane Overview", + "uid": "union-dp-overview", + "version": 1 + } +--- +# Source: dataplane/templates/nodeexecutor/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: executor + namespace: union + labels: + app: executor +data: + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + linkType: ide + templateUris: + - /dataplane/pod/v1/generated_name/6060/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/union-test/{{.namespace}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/{{.generatedName}}/ + - wandb-execution-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .podName }}' + - wandb-custom-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .taskConfig.id }}' + - comet-ml-execution-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .executionName }}{{ .nodeId }}{{ + .taskRetryAttempt }}{{ .taskConfig.link_suffix }}' + - 
comet-ml-custom-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .taskConfig.experiment_key }}' + - neptune-scale-run: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .podName }} + - neptune-scale-custom-id: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .taskConfig.id }} + kubernetes-enabled: true + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + config.yaml: | + executor: + cluster: 'union-test' + evaluatorCount: 64 + maxActions: 2000 + organization: 'union' + unionAuth: + injectSecret: true + secretName: EAGER_API_KEY + workerName: worker1 + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 4096 + gpu: 256 + memory: 2Ti + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + authorizer: + type: noop + catalog-cache: + cache-endpoint: dns:///union.test.union.ai + endpoint: dns:///union.test.union.ai + insecure: false + type: fallback + use-admin-auth: true + logger: + level: 4 + show-source: true + sharedService: + metrics: + scope: 'executor:' + security: + allowCors: true + allowLocalhostAccess: true + allowedHeaders: + - Content-Type + allowedOrigins: + - '*' + secure: false + useAuth: false + propeller: + node-config: + disable-input-file-writes: true + plugins: + fasttask: + additional-worker-args: + - --last-ack-grace-period-seconds + - "120" + callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + grace-period-status-not-found: 2m + ioutils: + remoteFileOutputPaths: + deckFilename: report.html + k8s: + disable-inject-owner-references: true + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/templates/operator/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + config.yaml: | + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: 
/etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + sharedService: + features: + gatewayV2: true + port: 8081 + authorizer: + type: noop + operator: + enabled: true + enableTunnelService: true + tunnel: + enableDirectToAppIngress: false + deploymentToRestart: union-operator-proxy + apps: + enabled: 'false' + syncClusterConfig: + enabled: false + clusterId: + organization: 'union' + clusterData: + appId: 'test-client' + bucketName: 'test-bucket' + bucketRegion: 'us-east-1' + cloudHostName: 'union.test.union.ai' + gcpProjectId: '' + metadataBucketPrefix: 's3://test-bucket' + userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + userRoleKey: 'eks.amazonaws.com/role-arn' + collectUsages: + enabled: true + billing: + model: Legacy + dependenciesHeartbeat: + prometheus: + endpoint: 'http://union-operator-prometheus:80/-/healthy' + propeller: + endpoint: 'http://flytepropeller:10254' + proxy: + endpoint: 'http://union-operator-proxy:10254' + imageBuilder: + enabled: true + executionNamespaceLabels: + union.ai/namespace-type: flyte + referenceConfigmapName: union-operator + targetConfigMapName: "build-image-config" + proxy: + imageBuilderConfig: + authenticationType: 'noop' + defaultRepository: '' + persistedLogs: + objectStore: + pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} + prefix: persisted-logs + sourceType: ObjectStore + smConfig: + enabled: 'true' + k8sConfig: + namespace: 'union' + type: 'K8s' + logger.yaml: | + logger: + level: 4 + show-source: true + config-overrides.yaml: | + cache: + identity: + enabled: false + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + fast_registration_storage.yaml: | + fastRegistrationStorage: + container: "" + type: s3 + connection: + auth-type: iam + region: us-east-1 + image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + image-builder.default-repository: "" + image-builder.authentication-type: "noop" +--- +# Source: dataplane/templates/prometheus/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + alerting: + alertmanagers: + - static_configs: + - targets: + rule_files: + - rules.yml + scrape_configs: + # Self-monitoring + - job_name: prometheus + metrics_path: /prometheus/metrics + static_configs: + - targets: ['localhost:9090'] + metric_relabel_configs: + - source_labels: [__name__] + regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total + action: keep + + # Kube state metrics for pod/node resource tracking and cost calculations + - job_name: kube-state-metrics + static_configs: + - targets: ['release-name-kube-state-metrics:8080'] + 
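+      # Allowlist relabeling: the keep action below drops every series whose name does not match the regex, bounding scrape-time cardinality.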
metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total + action: keep + - separator: ; + source_labels: [__name__, phase] + regex: kube_pod_status_phase;(Succeeded|Failed) + action: drop + - source_labels: [node] + target_label: nodename + regex: '(.*)' + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: (.+) + target_label: label_node_pool_name + + # cAdvisor container metrics for CPU and memory tracking + - job_name: kubernetes-cadvisor + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes + action: keep + relabel_configs: + - separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + action: replace + + # Flyte propeller metrics for execution info and fast task duration + - job_name: flytepropeller + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - union + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # OpenCost metrics for cost tracking + - job_name: opencost + static_configs: + - targets: ['release-name-opencost:9003'] + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + rules.yml: | + + groups: + - name: cost_calculations_15s + interval: 15s + rules: + - record: pod_gpu_allocation + expr: | + sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) + - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
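+        # The nested label_replace calls below copy scrape labels onto label_*-prefixed names; label_entity_id is the join key shared with app_info and workspace_info.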
+ expr: | + max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ) + ) + - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( + label_replace( + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps + "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup + ), + "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key + ) + ) + - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_workspace_name, label_entity_id)( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces + "label_entity_id", "$1", "label_node_id", "(.*)" # join key + ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels + ) + ) + - record: fast_task_execution_duration + expr: | + max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ), + "namespace", "$1", "exported_namespace", "(.*)" + ), + "pod", "$1", "exported_pod", "(.*)" + ) + ) + - record: fast_task_execution_duration_rate + expr: | + irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration + - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
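+        # Worked example of the max(requested, used) policy encoded below: a pod
+        # requesting 2GiB whose working set grows to 3GiB is charged for 3GiB
+        # (the ">" arm keeps usage when it exceeds requests), while the same pod
+        # using only 1GiB is charged for its 2GiB request (the "or" arm falls back).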
+ expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity + # First, calculate the allocated memory for each pod + max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory + ( + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} + ) + ) + or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} 
== 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, calculate the allocated cpu for each pod + max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu + ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task 
labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + 
fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity + # First, calculate the used memory for each pod + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", 
label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:mem_usage_bytes_total_per_node:sum + ) + - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", 
"fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:cpu_usage_per_node:sum + ) + - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity + expr: | + avg by (label_entity_type, label_domain, label_project, label_entity_id) ( + # First, grab the SM occupancy for each pod + max by (namespace, pod) ( + DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # 
Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and 
allocated GPU count (something like "used memory", numerator of weighted calcs) + expr: | + entity_id:sm_occupancy:avg + * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum + - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. + expr: | + label_replace( + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:allocated_mem_cost:sum + or + entity_id:allocated_cpu_cost:sum + or + entity_id:allocated_gpu_cost:sum + ), + "type", "allocated", "", "" # add type info + ) + - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) + expr: | + label_replace( + sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) + # Start with each execution's and app's allocated cost per node + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity + / on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be 
different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts + ) + # Then multiply by the overhead cost per node + * on (node) group_left() ( + # To calculate overhead, start with the true cost of running each node + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes + * on (node) max by (node) ( + node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an unlabeled label to show up in the Compute Costs dashboard charts + ) * (15 / 3600) # convert hourly cost to 15-secondly cost + # Then subtract out the total allocated cost on each node + - on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + ) + ) + ), + "type", "overhead", "", "" # add type info + ) + - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) + expr: | + label_replace( + sum by (label_domain, label_project, label_entity_id, label_entity_type) ( + entity_id:allocated_cost:sum + or + entity_id:overhead_cost:sum + ), + "type", "total", "", "" # add type info + ) + - record: node:total_cost:sum # Total cost of all nodes + expr: | + sum ( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost + ) + - record: node_type:total_cost:sum # Total cost of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label + ) + 
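+      # Worked example of the per-sample cost conversion used throughout: a node
+      # billed at $0.48/hour contributes 0.48 * (15 / 3600) = $0.002 per 15s
+      # evaluation, and the 240 evaluations in an hour sum back to $0.48, which
+      # is why the 15m rollups below use sum_over_time to reconstruct spend.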
+      - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type
+        expr: |
+          sum by (node_type)(
+            avg by (node, node_type)( # dedupe
+              label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel
+            )
+          ) * (15 / 3600) # convert to number of hours per 15-second observation
+    # Aggregate the above into visible metrics
+    - name: cost_rollup_15m
+      interval: 15m
+      rules:
+      - record: execution_info15m
+        expr: |
+          max_over_time(execution_info[15m:15s])
+      - record: app_info15m
+        expr: |
+          max_over_time(app_info[15m:15s])
+      - record: workspace_info15m
+        expr: |
+          max_over_time(workspace_info[15m:15s])
+      - record: entity_id:allocated_mem_bytes:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s])
+      - record: entity_id:used_mem_bytes:sum15m
+        expr: |
+          sum_over_time(entity_id:used_mem_bytes:sum[15m:15s])
+      - record: entity_id:allocated_cpu:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_cpu:sum[15m:15s])
+      - record: entity_id:used_cpu:sum15m
+        expr: |
+          sum_over_time(entity_id:used_cpu:sum[15m:15s])
+      - record: entity_id:weighted_sm_occupancy:sum15m
+        expr: |
+          sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s])
+      - record: entity_id:gpu_count:sum15m
+        expr: |
+          sum_over_time(entity_id:gpu_count:sum[15m:15s])
+      - record: entity_id:allocated_mem_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s])
+      - record: entity_id:allocated_cpu_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s])
+      - record: entity_id:allocated_gpu_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s])
+      - record: entity_id:allocated_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_cost:sum[15m:15s])
+      - record: entity_id:overhead_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:overhead_cost:sum[15m:15s])
+      - record: entity_id:total_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:total_cost:sum[15m:15s])
+      - record: node:total_cost:sum15m
+        expr: |
+          sum_over_time(node:total_cost:sum[15m:15s])
+      - record: node_type:total_cost:sum15m
+        expr: |
+          sum_over_time(node_type:total_cost:sum[15m:15s])
+      - record: node_type:uptime_hours:sum15m
+        expr: |
+          sum_over_time(node_type:uptime_hours:sum[15m:15s])
+---
+# Source: dataplane/templates/propeller/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: flyte-propeller-config
+  namespace: union
+data:
+  admin.yaml: |
+    admin:
+      clientId: 'test-client'
+      clientSecretLocation: /etc/union/secret/client_secret
+      endpoint: dns:///union.test.union.ai
+      insecure: false
+    event:
+      capacity: 1000
+      rate: 500
+      type: admin
+  catalog.yaml: |
+    catalog-cache:
+      cache-endpoint: dns:///union.test.union.ai
+      endpoint: dns:///union.test.union.ai
+      insecure: false
+      type: fallback
+      use-admin-auth: true
+  copilot.yaml: |
+    plugins:
+      k8s:
+        co-pilot:
+          image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1'
+          name: flyte-copilot-
+          start-timeout: 30s
+  core.yaml: |
+    propeller:
+      downstream-eval-duration: 30s
+      enable-admin-launcher: true
+      leader-election:
+        enabled: true
+        lease-duration: 15s
+        lock-config-map:
+          name: propeller-leader
+          namespace: 'union'
+        renew-deadline: 10s
+        retry-period: 2s
+      limit-namespace: all
+      literal-offloading-config:
+        enabled: true
+      max-workflow-retries: 30
+      metadata-prefix: metadata/propeller
+      metrics-prefix: flyte
+      prof-port: 10254
+      queue:
+        batch-size: -1
+        batching-interval: 2s
+        queue:
+ base-delay: 5s + capacity: 1000 + max-delay: 120s + rate: 100 + type: maxof + sub-queue: + capacity: 100 + rate: 10 + type: bucket + type: batch + rawoutput-prefix: 's3://test-bucket' + workers: 4 + workflow-reeval-duration: 30s + webhook: + certDir: /etc/webhook/certs + embeddedSecretManagerConfig: + imagePullSecrets: + enabled: true + k8sConfig: + namespace: 'union' + type: 'K8s' + listenPort: '9443' + secretManagerTypes: + - Embedded + - K8s + serviceName: flyte-pod-webhook + servicePort: '443' + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + logger.yaml: | + logger: + level: 4 + show-source: true + resource_manager.yaml: | + propeller: + resourcemanager: + type: noop + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + templateUris: + - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ + kubernetes-enabled: false + templates: + - displayName: Task Logs + scheme: TaskExecution + templateUris: + - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + 
verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +# Source: dataplane/charts/opencost/templates/clusterrole.yaml +# Cluster role giving opencost to get, list, watch required resources +# No write permissions are required +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] + resources: + - configmaps + - deployments + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - get + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - deployments + - daemonsets + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - get + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - get + - list + - watch + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-clustersync-resource +rules: + - apiGroups: + - "" + - rbac.authorization.k8s.io + resources: + - configmaps + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - clusterrolebindings + - podtemplates + verbs: + - '*' +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - events + - flyteworkflows + - pods/log + - pods + - rayjobs + - resourcequotas + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - nonResourceURLs: + - /metrics + verbs: + - get +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - pods + - endpoints + - services + verbs: + - get + - list + - watch + - nonResourceURLs: + - /metrics + - /metrics/cadvisor + verbs: + - get +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch +--- +# Source: 
dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: flytepropeller-role +rules: + # Allow RO access to PODS + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + # Allow Event recording access + - apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch + # Allow Access All plugin objects + - apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + # Allow Access to CRD + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: fluentbit-system + namespace: union +--- +# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-kube-state-metrics +subjects: +- kind: ServiceAccount + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-opencost +subjects: + - kind: ServiceAccount + name: release-name-opencost + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-resource +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-clustersync-resource +subjects: + - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: 
+ - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-executor +subjects: +- kind: ServiceAccount + name: executor + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: proxy-system +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-operator-prometheus +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Create a binding from Role -> ServiceAccount +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-webhook-role +subjects: + - kind: ServiceAccount + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: flytepropeller-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-role +subjects: + - kind: ServiceAccount + name: flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: operator-system + labels: + 
app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + - deployments + verbs: + - get + - list + - watch + - create + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system-secret +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/charts/fluentbit/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 2020 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + annotations: + prometheus.io/scrape: 'true' +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/opencost/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + selector: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + type: "ClusterIP" + ports: + - name: http + port: 9003 + targetPort: 9003 +--- +# Source: dataplane/templates/clusterresourcesync/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: syncresources + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm 
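+# Port 10254 below follows the debug/metrics port convention shared by the
+# Union services in this chart (flytepropeller's prof-port and the operator
+# heartbeat endpoints above use the same port).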
+spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/imagebuilder/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 1234 + targetPort: tcp + protocol: TCP + name: tcp + selector: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/nodeexecutor/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-dataplane-executor + labels: + platform.union.ai/prometheus-group: "union-services" + app: executor +spec: + type: ClusterIP + ports: + - port: 15605 + targetPort: 15605 + protocol: TCP + name: fasttask + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app: executor +--- +# Source: dataplane/templates/operator/service-proxy.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-proxy + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/operator/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/prometheus/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 9090 + protocol: TCP + name: http + selector: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name +--- +# Source: 
dataplane/templates/propeller/service-webhook.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + projectcontour.io/upstream-protocol.h2c: grpc +spec: + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: https + protocol: TCP + port: 443 + targetPort: 9443 + - name: debug + protocol: TCP + port: 10254 + targetPort: 10254 +--- +# Source: dataplane/templates/propeller/service-webhook.yaml +# Headless Service for cache invalidation — resolves to all pod IPs so that +# we can fan out invalidation requests to every webhook replica. +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook-headless + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: cache-internal + protocol: TCP + port: 9443 + targetPort: 9443 +--- +# Source: dataplane/templates/propeller/service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: union + name: flytepropeller + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: debug + protocol: TCP + port: 10254 + - name: fasttask + port: 15605 + protocol: TCP + targetPort: 15605 + selector: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/fluentbit/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + annotations: + checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + spec: + serviceAccountName: fluentbit-system + hostNetwork: false + dnsPolicy: ClusterFirst + containers: + - name: fluentbit + image: "cr.fluentbit.io/fluent/fluent-bit:3.2.8" + imagePullPolicy: IfNotPresent + command: + - /fluent-bit/bin/fluent-bit + args: + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + ports: + - name: http + containerPort: 2020 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: /api/v1/health + port: http + volumeMounts: + - name: config + mountPath: /fluent-bit/etc/conf + - mountPath: /var/log + name: varlog + - mountPath: /var/lib/docker/containers + name: varlibdockercontainers + readOnly: true + - mountPath: /etc/machine-id + name: etcmachineid + readOnly: true + volumes: + - name: config + configMap: + name: fluentbit-system + - hostPath: + path: /var/log + name: varlog + - hostPath: + path: 
/var/lib/docker/containers + name: varlibdockercontainers + - hostPath: + path: /etc/machine-id + type: File + name: etcmachineid + tolerations: + - operator: Exists +--- +# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: release-name-kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: kube-state-metrics + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + ports: + - containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: dataplane/charts/opencost/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: release-name-opencost + containers: + - name: release-name-opencost + image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 + 
imagePullPolicy: IfNotPresent + args: + ports: + - containerPort: 9003 + name: http + resources: + limits: + cpu: 1000m + memory: 4Gi + requests: + cpu: 500m + memory: 1Gi + startupProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 30 + livenessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 20 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + env: + - name: LOG_LEVEL + value: info + - name: CUSTOM_COST_ENABLED + value: "false" + - name: KUBECOST_NAMESPACE + value: union + - name: API_PORT + value: "9003" + - name: PROMETHEUS_SERVER_ENDPOINT + value: "http://union-operator-prometheus.union.svc:80/prometheus" + - name: CLUSTER_ID + value: "default-cluster" + - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS + value: "15" + - name: CLOUD_COST_ENABLED + value: "false" + - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL + value: "6" + - name: CLOUD_COST_REFRESH_RATE_HOURS + value: "6" + - name: CLOUD_COST_QUERY_WINDOW_DAYS + value: "7" + - name: CLOUD_COST_RUN_WINDOW_DAYS + value: "3" + # Add any additional provided variables +--- +# Source: dataplane/templates/clusterresourcesync/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-syncresources + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "b511750d960c272bb6a4f3ddbbfd46cfcaf0f7dfa7c3e4348c14af517722b00" + + labels: + + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - clusterresource + - --config + - /etc/flyte/config/*.yaml + - clusterresource + - run + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: sync-cluster-resources + resources: + limits: + cpu: "1" + memory: 500Mi + requests: + cpu: 500m + memory: 100Mi + volumeMounts: + - name: auth + mountPath: /etc/union/secret + - name: resource-templates + mountPath: /etc/flyte/clusterresource/templates + - name: config-volume + mountPath: /etc/flyte/config + ports: + - name: debug + containerPort: 10254 + protocol: TCP + serviceAccountName: union-clustersync-system + volumes: + - configMap: + name: union-clusterresource-template + name: resource-templates + - configMap: + name: union-clusterresourcesync-config + name: config-volume + 
- name: auth + secret: + secretName: union-secret-auth + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - flyte + - serve + - connector + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" + imagePullPolicy: "IfNotPresent" + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "10" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector + + nodeSelector: + + app_pool: connectors + tolerations: + + - effect: NoSchedule + key: connectors + operator: Exists +--- +# Source: dataplane/templates/imagebuilder/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + container.apparmor.security.beta.kubernetes.io/buildkit: unconfined + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: "union-imagebuilder" + containers: + - name: "buildkit" + image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - mountPath: /home/user/.local/share/buildkit + name: buildkitd + - mountPath: /etc/buildkit + name: buildkit-config + args: + - --config + - /etc/buildkit/buildkitd.toml + - --addr + - unix:///run/user/1000/buildkit/buildkitd.sock + - --addr + - tcp://0.0.0.0:1234 + - 
--oci-worker-no-process-sandbox + ports: + - name: tcp + containerPort: 1234 + protocol: TCP + readinessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + livenessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + securityContext: + seccompProfile: # Needs Kubernetes >= 1.19 + type: Unconfined + runAsUser: 1000 + runAsGroup: 1000 + resources: + requests: + cpu: 1 + ephemeral-storage: 20Gi + memory: 1Gi + volumes: + - name: buildkitd + emptyDir: {} + - configMap: + name: union-operator-buildkit + name: buildkit-config + + nodeSelector: + app_pool: flyte + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/nodeexecutor/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executor + namespace: union + labels: + app: executor +spec: + replicas: 1 + selector: + matchLabels: + app: executor + template: + metadata: + annotations: + configChecksum: "3d931e5636192b94c904aa780a60effc2bb71861f72f22b448e711b33d41918" + + labels: + + app: executor + spec: + securityContext: + fsGroup: 1337 + serviceAccountName: executor + volumes: + - name: config-volume + configMap: + name: executor + - name: secret-volume + secret: + secretName: union-secret-auth + - name: auth + secret: + secretName: union-secret-auth + containers: + - name: executor + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + command: + - executor + - serve + - --config + - /etc/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + resources: + limits: + cpu: "4" + memory: "8Gi" + requests: + cpu: "1" + memory: "1Gi" + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: secret-volume + mountPath: /etc/union/secret + - name: auth + mountPath: /etc/secrets/ + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment-proxy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-proxy + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + template: + metadata: + 
annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + volumes: + - name: config-volume + projected: + sources: + - configMap: + name: union-operator + - configMap: + name: union-clusterresourcesync-config + - name: secret-volume + secret: + secretName: union-secret-auth + serviceAccountName: proxy-system + securityContext: + {} + containers: + - name: operator-proxy + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + args: + - operator + - proxy + - --config + - /etc/union/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: connect + containerPort: 8080 + protocol: TCP + - name: grpc + containerPort: 8081 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + - name: "tunnel" + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + args: + - cloudflared + - tunnel + - --no-autoupdate + - run + - --token + - $(TUNNEL_TOKEN) + env: + - name: TUNNEL_TOKEN + valueFrom: + secretKeyRef: + name: union-secret-auth + key: tunnel_token + optional: true + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + serviceAccountName: operator-system + securityContext: + {} + volumes: + - name: config-volume + configMap: + name: union-operator + - name: secret-volume + secret: + secretName: union-secret-auth + containers: + - name: operator + securityContext: + {} + 
image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + args: + - operator + - serve + - --config + - /etc/union/config/*.yaml + - --operator.clusterId.name + - "$(CLUSTER_NAME)" + - --operator.tunnel.k8sSecretName + - union-secret-auth + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/prometheus/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "9505483b28e45abfefda9a9791a7719382b61225386ddfbdfea71a459a1423e" + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + spec: + priorityClassName: system-cluster-critical + serviceAccountName: union-operator-prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: prometheus + image: "prom/prometheus:v3.3.1" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --web.external-url=/prometheus/ + - --web.route-prefix=/prometheus/ + - --storage.tsdb.retention.time=3d + ports: + - name: http + containerPort: 9090 + protocol: TCP + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + volumes: + - name: prometheus-config + configMap: + name: union-operator-prometheus + + nodeSelector: + + app_pool: monitoring + tolerations: + + - effect: NoSchedule + key: monitoring + operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create the actual deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + 
name: flytepropeller-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + serviceAccountName: flytepropeller-webhook-system + initContainers: + - name: generate-secrets + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - init-certs + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + containers: + - name: webhook + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + ports: + - containerPort: 9443 + - containerPort: 10254 + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + readOnly: true + - name: webhook-certs + mountPath: /etc/webhook/certs + readOnly: true + volumes: + - name: config-volume + configMap: + name: flyte-propeller-config + - name: webhook-certs + secret: + secretName: flyte-pod-webhook + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + 
operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: union + name: flytepropeller + labels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + labels: + + + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + priorityClassName: system-cluster-critical + containers: + - command: + - flytepropeller + - --config + - /etc/flyte/config/*.yaml + - --propeller.cluster-id + - union-test + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: flytepropeller + ports: + - containerPort: 10254 + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + - name: auth + mountPath: /etc/union/secret + serviceAccountName: flytepropeller-system + volumes: + - configMap: + name: flyte-propeller-config + name: config-volume + - name: auth + secret: + secretName: union-secret-auth + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "release-name-fluentbit-test-connection" + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm + annotations: + helm.sh/hook: test + helm.sh/hook-delete-policy: hook-succeeded +spec: + containers: + - name: wget + image: "busybox:latest" + imagePullPolicy: Always + command: ["sh"] + args: ["-c", "sleep 5s && wget -O- release-name-fluentbit:2020"] + restartPolicy: Never diff --git 
a/tests/values/dataplane.global-scheduling.yaml b/tests/values/dataplane.global-scheduling.yaml
new file mode 100644
index 00000000..42d8aee4
--- /dev/null
+++ b/tests/values/dataplane.global-scheduling.yaml
@@ -0,0 +1,42 @@
+# Test that global scheduling (nodeSelector, tolerations, affinity) cascades
+# to all Union-owned components: propeller, executor, webhook, operator, proxy,
+# clusterresourcesync, prometheus, flyteconnector, and imagebuilder/buildkit.
+
+host: union.test.union.ai
+clusterName: union-test
+orgName: union
+provider: aws
+
+storage:
+  provider: aws
+  authType: iam
+  bucketName: test-bucket
+  region: us-east-1
+
+secrets:
+  admin:
+    create: true
+    clientSecret: test-secret
+    clientId: test-client
+
+# Enable flyteconnector to verify it also gets global scheduling
+flyteconnector:
+  enabled: true
+
+scheduling:
+  nodeSelector:
+    app_pool: flyte
+  tolerations:
+    - key: "flyte"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: workload-type
+                operator: In
+                values:
+                  - flyte
diff --git a/tests/values/dataplane.scheduling-override.yaml b/tests/values/dataplane.scheduling-override.yaml
new file mode 100644
index 00000000..684b3d42
--- /dev/null
+++ b/tests/values/dataplane.scheduling-override.yaml
@@ -0,0 +1,47 @@
+# Test that per-service scheduling overrides take precedence over global scheduling.
+# Global scheduling sets app_pool: flyte; prometheus and flyteconnector override it with their own pools (monitoring and connectors).
+
+host: union.test.union.ai
+clusterName: union-test
+orgName: union
+provider: aws
+
+storage:
+  provider: aws
+  authType: iam
+  bucketName: test-bucket
+  region: us-east-1
+
+secrets:
+  admin:
+    create: true
+    clientSecret: test-secret
+    clientId: test-client
+
+scheduling:
+  nodeSelector:
+    app_pool: flyte
+  tolerations:
+    - key: "flyte"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
+
+# Per-service overrides should win over global
+prometheus:
+  nodeSelector:
+    app_pool: monitoring
+  tolerations:
+    - key: "monitoring"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
+
+flyteconnector:
+  enabled: true
+  nodeSelector:
+    app_pool: connectors
+  tolerations:
+    - key: "connectors"
+      operator: "Exists"
+      effect: "NoSchedule"
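
One way to sanity-check these two fixtures locally is to render the chart with each values file and assert on the nodeSelector of a few representative Deployments from the snapshot above. The sketch below is illustrative only and not part of this change: the release name ("release-name"), the chart path ("charts/dataplane"), and the PyYAML dependency are assumptions; the Deployment names and expected nodeSelector values are taken from the rendered output in this diff.

    #!/usr/bin/env python3
    # Hand-written spot-check for the scheduling fixtures (not part of the chart's
    # test suite). Requires helm on PATH and PyYAML installed.
    import subprocess
    import yaml

    def rendered_docs(values_file):
        # Render the chart with one fixture and parse every manifest document.
        out = subprocess.run(
            ["helm", "template", "release-name", "charts/dataplane", "-f", values_file],
            check=True, capture_output=True, text=True,
        ).stdout
        return [d for d in yaml.safe_load_all(out) if d]

    def node_selector(docs, kind, name):
        for d in docs:
            if d.get("kind") == kind and d["metadata"]["name"] == name:
                return d["spec"]["template"]["spec"].get("nodeSelector")
        raise LookupError(f"{kind}/{name} not rendered")

    # Global scheduling should cascade to components with no per-service override.
    docs = rendered_docs("tests/values/dataplane.global-scheduling.yaml")
    assert node_selector(docs, "Deployment", "flytepropeller") == {"app_pool": "flyte"}

    # Per-service values should win where they are set.
    docs = rendered_docs("tests/values/dataplane.scheduling-override.yaml")
    assert node_selector(docs, "Deployment", "union-operator-prometheus") == {"app_pool": "monitoring"}
    assert node_selector(docs, "Deployment", "flyteconnector") == {"app_pool": "connectors"}
    print("scheduling fixtures render as expected")

If the fallback logic regressed (for example, a per-service block shadowing the global one without rendering anything), the assertions would fail with the actually-rendered nodeSelector, which makes the failure mode easy to read from the traceback.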