diff --git a/.gitignore b/.gitignore index c16aa5fd..bbc721c9 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,8 @@ __pycache__/ *.tgz .claude/ +# Extracted subchart artifacts from helm dep update +/kube-prometheus-stack/ +/kube-state-metrics/ +/metrics-server/ + diff --git a/charts/dataplane/templates/_helpers.tpl b/charts/dataplane/templates/_helpers.tpl index b1f9d611..f0c777e9 100644 --- a/charts/dataplane/templates/_helpers.tpl +++ b/charts/dataplane/templates/_helpers.tpl @@ -910,6 +910,201 @@ nodeName: {{- toYaml . }} {{- end }} {{- end -}} +{{/* +Prometheus scheduling helpers +*/}} +{{- define "prometheus.scheduling.topologySpreadConstraints" -}} +{{- with .Values.prometheus.topologySpreadConstraints }} +topologySpreadConstraints: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.affinity" -}} +{{- with .Values.prometheus.affinity }} +affinity: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.nodeSelector" -}} +{{- with .Values.prometheus.nodeSelector }} +nodeSelector: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.nodeName" -}} +{{- with .Values.prometheus.nodeName }} +nodeName: {{ toYaml . }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling.tolerations" -}} +{{- with .Values.prometheus.tolerations }} +tolerations: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "prometheus.scheduling" -}} +{{- if .Values.prometheus.topologySpreadConstraints }} +{{- include "prometheus.scheduling.topologySpreadConstraints" . }} +{{- else }} +{{- include "global.scheduling.topologySpreadConstraints" . }} +{{- end }} +{{- if .Values.prometheus.affinity }} +{{- include "prometheus.scheduling.affinity" . }} +{{- else }} +{{- include "global.scheduling.affinity" . }} +{{- end }} +{{- if .Values.prometheus.nodeSelector }} +{{- include "prometheus.scheduling.nodeSelector" . }} +{{- else }} +{{- include "global.scheduling.nodeSelector" . }} +{{- end }} +{{- if .Values.prometheus.nodeName }} +{{- include "prometheus.scheduling.nodeName" . }} +{{- else }} +{{- include "global.scheduling.nodeName" . }} +{{- end }} +{{- if .Values.prometheus.tolerations }} +{{- include "prometheus.scheduling.tolerations" . }} +{{- else }} +{{- include "global.scheduling.tolerations" . }} +{{- end }} +{{- end -}} + +{{/* +Flyteconnector scheduling helpers +*/}} +{{- define "flyteconnector.scheduling.topologySpreadConstraints" -}} +{{- with .Values.flyteconnector.topologySpreadConstraints }} +topologySpreadConstraints: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.affinity" -}} +{{- with .Values.flyteconnector.affinity }} +affinity: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.nodeSelector" -}} +{{- with .Values.flyteconnector.nodeSelector }} +nodeSelector: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.nodeName" -}} +{{- with .Values.flyteconnector.nodeName }} +nodeName: {{ toYaml . }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling.tolerations" -}} +{{- with .Values.flyteconnector.tolerations }} +tolerations: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "flyteconnector.scheduling" -}} +{{- if .Values.flyteconnector.topologySpreadConstraints }} +{{- include "flyteconnector.scheduling.topologySpreadConstraints" . }} +{{- else }} +{{- include "global.scheduling.topologySpreadConstraints" . 
}} +{{- end }} +{{- if .Values.flyteconnector.affinity }} +{{- include "flyteconnector.scheduling.affinity" . }} +{{- else }} +{{- include "global.scheduling.affinity" . }} +{{- end }} +{{- if .Values.flyteconnector.nodeSelector }} +{{- include "flyteconnector.scheduling.nodeSelector" . }} +{{- else }} +{{- include "global.scheduling.nodeSelector" . }} +{{- end }} +{{- if .Values.flyteconnector.nodeName }} +{{- include "flyteconnector.scheduling.nodeName" . }} +{{- else }} +{{- include "global.scheduling.nodeName" . }} +{{- end }} +{{- if .Values.flyteconnector.tolerations }} +{{- include "flyteconnector.scheduling.tolerations" . }} +{{- else }} +{{- include "global.scheduling.tolerations" . }} +{{- end }} +{{- end -}} + +{{/* +Imagebuilder buildkit scheduling helpers +*/}} +{{- define "imagebuilder.buildkit.scheduling.topologySpreadConstraints" -}} +{{- with .Values.imageBuilder.buildkit.topologySpreadConstraints }} +topologySpreadConstraints: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.affinity" -}} +{{- with .Values.imageBuilder.buildkit.affinity }} +affinity: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.nodeSelector" -}} +{{- with .Values.imageBuilder.buildkit.nodeSelector }} +nodeSelector: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.nodeName" -}} +{{- with .Values.imageBuilder.buildkit.nodeName }} +nodeName: {{ toYaml . }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling.tolerations" -}} +{{- with .Values.imageBuilder.buildkit.tolerations }} +tolerations: +{{ toYaml . | nindent 2 }} +{{- end }} +{{- end }} + +{{- define "imagebuilder.buildkit.scheduling" -}} +{{- if .Values.imageBuilder.buildkit.topologySpreadConstraints }} +{{- include "imagebuilder.buildkit.scheduling.topologySpreadConstraints" . }} +{{- else }} +{{- include "global.scheduling.topologySpreadConstraints" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.affinity }} +{{- include "imagebuilder.buildkit.scheduling.affinity" . }} +{{- else }} +{{- include "global.scheduling.affinity" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.nodeSelector }} +{{- include "imagebuilder.buildkit.scheduling.nodeSelector" . }} +{{- else }} +{{- include "global.scheduling.nodeSelector" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.nodeName }} +{{- include "imagebuilder.buildkit.scheduling.nodeName" . }} +{{- else }} +{{- include "global.scheduling.nodeName" . }} +{{- end }} +{{- if .Values.imageBuilder.buildkit.tolerations }} +{{- include "imagebuilder.buildkit.scheduling.tolerations" . }} +{{- else }} +{{- include "global.scheduling.tolerations" . }} +{{- end }} +{{- end -}} + {{/* Global service account annotations */}} diff --git a/charts/dataplane/templates/flyteconnector/deployment.yaml b/charts/dataplane/templates/flyteconnector/deployment.yaml index a6e60737..0dc9a3cd 100644 --- a/charts/dataplane/templates/flyteconnector/deployment.yaml +++ b/charts/dataplane/templates/flyteconnector/deployment.yaml @@ -62,13 +62,5 @@ spec: {{- with .Values.flyteconnector.additionalVolumes -}} {{ tpl (toYaml .) $ | nindent 6 }} {{- end }} - {{- with .Values.flyteconnector.nodeSelector }} - nodeSelector: {{ tpl (toYaml .) $ | nindent 8 }} - {{- end }} - {{- with .Values.flyteconnector.affinity }} - affinity: {{ tpl (toYaml .) $ | nindent 8 }} - {{- end }} - {{- with .Values.flyteconnector.tolerations }} - tolerations: {{ tpl (toYaml .) 
$ | nindent 8 }} - {{- end }} + {{- include "flyteconnector.scheduling" . | nindent 6 }} {{- end }} diff --git a/charts/dataplane/templates/imagebuilder/deployment.yaml b/charts/dataplane/templates/imagebuilder/deployment.yaml index cdee1452..45370550 100644 --- a/charts/dataplane/templates/imagebuilder/deployment.yaml +++ b/charts/dataplane/templates/imagebuilder/deployment.yaml @@ -100,9 +100,10 @@ spec: {{- with .Values.imageBuilder.buildkit.additionalVolumes -}} {{ tpl (toYaml .) $ | nindent 6 }} {{- end }} - {{- with .Values.imageBuilder.buildkit.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} + {{- if .Values.imageBuilder.buildkit.nodeSelector }} + {{- include "imagebuilder.buildkit.scheduling.nodeSelector" . | nindent 6 }} + {{- else if .Values.scheduling.nodeSelector }} + {{- include "global.scheduling.nodeSelector" . | nindent 6 }} {{- end }} affinity: podAntiAffinity: @@ -111,8 +112,9 @@ spec: matchLabels: {{- include "imagebuilder.buildkit.selectorLabels" . | nindent 16 }} topologyKey: "kubernetes.io/hostname" - {{- with .Values.imageBuilder.buildkit.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} + {{- if .Values.imageBuilder.buildkit.tolerations }} + {{- include "imagebuilder.buildkit.scheduling.tolerations" . | nindent 6 }} + {{- else if .Values.scheduling.tolerations }} + {{- include "global.scheduling.tolerations" . | nindent 6 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/dataplane/templates/prometheus/deployment.yaml b/charts/dataplane/templates/prometheus/deployment.yaml index ab7459e1..5ce213fd 100644 --- a/charts/dataplane/templates/prometheus/deployment.yaml +++ b/charts/dataplane/templates/prometheus/deployment.yaml @@ -55,15 +55,4 @@ spec: - name: prometheus-config configMap: name: {{ include "union-operator.fullname" . }}-prometheus - {{- with .Values.prometheus.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.prometheus.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.prometheus.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} + {{- include "prometheus.scheduling" . | nindent 6 }} diff --git a/charts/dataplane/values.yaml b/charts/dataplane/values.yaml index 97dc1707..c1829e74 100644 --- a/charts/dataplane/values.yaml +++ b/charts/dataplane/values.yaml @@ -769,6 +769,12 @@ opencost: limits: cpu: 1000m memory: 4Gi + # -- Tolerations for opencost pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for opencost pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} + # -- Affinity rules for opencost pods. + affinity: {} # -- Configuration for fluentbit used for the persistent logging feature. # FluentBit runs as a DaemonSet and ships container logs to the persisted-logs/ @@ -1164,6 +1170,10 @@ image: metrics-server: enabled: false + # -- Tolerations for metrics-server pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for metrics-server pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # -- nodeobserver contains the configuration information for the node observer service. nodeobserver: @@ -1334,7 +1344,11 @@ prometheus: # -- Standalone kube-state-metrics for Union features (cost tracking, pod resource metrics). # Metric filtering is handled in the Prometheus static scrape config. 
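+# An illustrative override (the node label and taint key below are
+# hypothetical placeholders, not chart defaults): on clusters whose
+# dataplane nodes are tainted, the scheduling fields introduced below
+# would typically mirror the global scheduling block, e.g.
+#   kube-state-metrics:
+#     nodeSelector:
+#       node-pool: union-dataplane
+#     tolerations:
+#       - key: union.ai/dedicated
+#         operator: Exists
+#         effect: NoSchedule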
-kube-state-metrics: {} +kube-state-metrics: + # -- Tolerations for kube-state-metrics pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for kube-state-metrics pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # -- Scopes the deployment, permissions and actions created into a single namespace low_privilege: false @@ -1704,6 +1718,10 @@ monitoring: prometheusOperator: enabled: true + # -- Tolerations for prometheus-operator pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for prometheus-operator pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # CRDs should be installed separately via the dataplane-crds chart # (set crds.prometheusOperator: true) before enabling the monitoring stack. @@ -1727,6 +1745,11 @@ monitoring: # Should override for production deployments adminPassword: admin + # -- Tolerations for grafana pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for grafana pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} + # Default monitoring stack for all relevant K8s components that impact # Union performance and reliability. coreDns: @@ -1753,6 +1776,10 @@ monitoring: kube-state-metrics: nameOverride: "monitoring-kube-state-metrics" fullnameOverride: "monitoring-kube-state-metrics" + # -- Tolerations for monitoring kube-state-metrics pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for monitoring kube-state-metrics pods. Set to match scheduling.nodeSelector when using dedicated node pools. + nodeSelector: {} # By default, install a separate Prometheus instance for monitoring. # This is the simplest, out of the box model, it is highly recommended that users look @@ -1782,3 +1809,8 @@ monitoring: requests: cpu: "500m" memory: "1Gi" + + # -- Tolerations for monitoring prometheus pods. Set to match scheduling.tolerations when using dedicated node pools. + tolerations: [] + # -- Node selector for monitoring prometheus pods. Set to match scheduling.nodeSelector when using dedicated node pools. 
+ nodeSelector: {} diff --git a/tests/generated/dataplane.global-scheduling.yaml b/tests/generated/dataplane.global-scheduling.yaml new file mode 100644 index 00000000..164175a8 --- /dev/null +++ b/tests/generated/dataplane.global-scheduling.yaml @@ -0,0 +1,6401 @@ +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-production +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-production +--- +# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluentbit-system + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +automountServiceAccountToken: true +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/imagebuilder/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-imagebuilder +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: executor + namespace: union + labels: + app: executor +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- 
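+# Note (illustrative comment for readers of this fixture; the label and
+# taint key are hypothetical, and the exact test values are not shown in
+# this patch): a golden file like this would be rendered with chart-level
+# defaults along the lines of
+#   scheduling:
+#     nodeSelector:
+#       node-pool: union-dataplane
+#     tolerations:
+#       - key: union.ai/dedicated
+#         operator: Exists
+#         effect: NoSchedule
+# so that every component without a component-level override inherits
+# these via the global.scheduling.* helpers.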
+# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/prometheus/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/common/auth-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: union-secret-auth + namespace: union +type: Opaque +data: + # TODO(rob): update or configure operator to use client_secret like all the other components. + app_secret: dGVzdC1zZWNyZXQ= + client_secret: dGVzdC1zZWNyZXQ= +--- +# Source: dataplane/templates/common/cluster-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: operator-cluster-name +type: Opaque +data: + cluster_name: dW5pb24tdGVzdA== +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create an empty secret that the first propeller pod will populate +apiVersion: v1 +kind: Secret +metadata: + name: flyte-pod-webhook + namespace: union +type: Opaque +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresourcesync-config + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + cluster_resources.yaml: | + cluster_resources: + clusterName: 'union-test' + customData: + - production: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - staging: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - development: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + refreshInterval: 5m + standaloneDeployment: true + templatePath: /etc/flyte/clusterresource/templates + clusterResourcesPrivate: + app: + isServerless: false + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + connection: + host: dns:///union.test.union.ai + admin.yaml: | + 
admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + event: + capacity: 1000 + rate: 500 + type: admin + domain.yaml: | + domains: + - id: development + name: development + - id: staging + name: staging + - id: production + name: production + clusters.yaml: | + clusters: + clusterConfigs: [] + labelClusterMap: {} + logger.yaml: | + logger: + level: 4 + show-source: true +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresource-template + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + a_namespace.yaml: | + apiVersion: v1 + kind: Namespace + metadata: + name: {{ namespace }} + labels: + union.ai/namespace-type: flyte + spec: + finalizers: + - kubernetes + + b_default_service_account.yaml: | + apiVersion: v1 + kind: ServiceAccount + metadata: + name: default + namespace: {{ namespace }} + annotations: + {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + + c_project_resource_quota.yaml: | + apiVersion: v1 + kind: ResourceQuota + metadata: + name: project-quota + namespace: {{ namespace }} + spec: + hard: + limits.cpu: {{ projectQuotaCpu }} + limits.memory: {{ projectQuotaMemory }} + requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} +--- +# Source: dataplane/templates/fluent-bit/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentbit-system + namespace: union + labels: + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +data: + custom_parsers.conf: | + [PARSER] + Name docker_no_time + Format json + Time_Keep Off + Time_Key time + Time_Format %Y-%m-%dT%H:%M:%S.%L + fluent-bit.conf: | + [SERVICE] + Parsers_File /fluent-bit/etc/parsers.conf + Parsers_File /fluent-bit/etc/conf/custom_parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + Health_Check On + [INPUT] + Name tail + Tag namespace-.pod-.cont- + Tag_Regex (?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)- + Path /var/log/containers/*.log + DB /var/log/flb_kube.db + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + + + [OUTPUT] + Name s3 + Match * + upload_timeout 1m + s3_key_format /persisted-logs/$TAG + static_file_path true + json_date_key false + region us-east-1 + bucket test-bucket +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name : union-operator-buildkit +data: + buildkitd.toml: | + debug = false + + [log] + format = "text" + + [worker.oci] + enabled = true + snapshotter = "auto" + gc = true + max-parallelism = 0 + + # Should not be used if Policies are defined + gckeepstorage = "10%" + [[worker.oci.gcpolicy]] + # Remove COPY/ADD and git checkout files + keepBytes = "10%" + keepDuration = "24h" + filters = [ "type==source.local", "type==source.git.checkout" ] + [[worker.oci.gcpolicy]] + # Remove locally cached image layers after it's unused for 24 hours + keepBytes = "10%" + keepDuration = "24h" + filters = [ "regular" ] + [[worker.oci.gcpolicy]] + # Remove shared cache mounts. E.G. 
Pip cache + keepBytes = "10%" + keepDuration = "72h" + filters = [ "type==exec.cachemount" ] + [[worker.oci.gcpolicy]] + # Remove everything else to keep the cache size under total file system limit + all = true + keepBytes = "80%" +--- +# Source: dataplane/templates/monitoring/dashboard-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-dashboard-union-dataplane-overview + namespace: union + labels: + grafana_dashboard: "1" + app.kubernetes.io/managed-by: Helm +data: + union-dataplane-overview.json: |- + { + "annotations": { + "list": [] + }, + "description": "Union Dataplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Active Workflows", + "type": "stat", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + } + ], + "description": "Current active FlyteWorkflow CRD count managed by Propeller." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_node_executions{namespace=\"$namespace\"})", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_task_executions{namespace=\"$namespace\"})", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Queue Depth", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", + "legendFormat": "Main", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", + "legendFormat": "Sub", + "refId": "B" + } + ], + "description": "Main and sub workqueue depth over time." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(executor:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics in DP services. Any non-zero value indicates a service caught a panic during request handling." + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all DP deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. Requires monitoring.slos.enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Execution Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:execution_success_rate or (union:dp:slo:propeller_success_rate + union:dp:slo:executor_success_rate) / 2 or union:dp:slo:propeller_success_rate or vector(1)", + "refId": "A" + } + ], + "description": "Combined V1 (propeller) and V2 (executor) task success rate. Falls back to propeller-only or 100% when idle." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Propeller Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", + "refId": "A" + } + ], + "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "DP service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 200, + "title": "Union Operator", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 201, + "title": "Work Queue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:work_queue:operations_processed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(union_operator:work_queue:operations_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "description": "Operator execution operation processing rate and failure rate." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 202, + "title": "Background Process Runs / Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:heartbeat_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:heartbeat_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:status_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status runs", + "refId": "C" + }, + { + "expr": "rate(union_operator:status_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status errors", + "refId": "D" + }, + { + "expr": "rate(union_operator:prometheus_health_checker:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Prom health errors", + "refId": "E" + } + ], + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 203, + "title": "Heartbeat Latency", + "type": "timeseries", + "targets": [ + { + "expr": "union_operator:heartbeat:compute_capabilities_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Capabilities p90", + "refId": "A" + }, + { + "expr": "union_operator:heartbeat:compute_usages_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Usages p90", + "refId": "B" + }, + { + "expr": "union_operator:heartbeat:list_workflows_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "List WFs p90", + "refId": "C" + } + ], + "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 204, + "title": "Config Syncer", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:config_syncer:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:config_syncer:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:config_syncer:propeller_configmap_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller CM updated", + "refId": "C" + } + ], + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 205, + "title": "Billable Usage Collector", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:billable_usage_collector:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:billable_usage_collector:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 206, + "title": "Work Queue Paused", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "union_operator:work_queue:paused{namespace=\"$namespace\"}", + "legendFormat": "Paused", + "refId": "A" + } + ], + "description": "1 when operator paused due to resource limits (FlyteWorkflow count or storage exceeded)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 300, + "title": "Executor (V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 301, + "title": "Active Actions & Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "legendFormat": "Active actions", + "refId": "A" + }, + { + "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "legendFormat": "Available capacity", + "refId": "B" + } + ], + "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 302, + "title": "Cache Discovery", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Miss", + "refId": "A" + }, + { + "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Put success", + "refId": "B" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 303, + "title": "Actions Terminated by Phase", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ phase }}", + "refId": "A" + } + ], + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 304, + "title": "Evaluator Duration (pod creation)", + "type": "timeseries", + "targets": [ + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Evaluate p50", + "refId": "A" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Evaluate p90", + "refId": "B" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "Evaluate p99", + "refId": "C" + } + ], + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 305, + "title": "System Failures & Invalid Leases", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "System failures", + "refId": "A" + }, + { + "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Exhausted retries", + "refId": "B" + }, + { + "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Invalid leases", + "refId": "C" + }, + { + "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Evaluate errors", + "refId": "D" + } + ], + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 100, + "title": "Flyte Propeller (V1)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 101, + "title": "Round Time (p50 / p90 / p99)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 102, + "title": "Round Success / Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Success", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Panics", + "refId": "C" + } + ], + "description": "Propeller round outcomes: success, errors, and panics per second." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 103, + "title": "Free Workers", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", + "legendFormat": "Free workers", + "refId": "A" + } + ], + "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 104, + "title": "Queue Add Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main adds", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sub adds", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main retries", + "refId": "C" + } + ], + "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 105, + "title": "Workflow Updates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Updated", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Too large", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Conflict", + "refId": "D" + } + ], + "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 106, + "title": "Workflow Update Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "etcd write latency for FlyteWorkflow status updates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 107, + "title": "Node Queueing & Execution Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Queue p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Queue p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", + "legendFormat": "Exec p90 (ms)", + "refId": "C" + } + ], + "description": "Node lifecycle: queueing latency (queued\u2192running) and execution latency (time in handler)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 108, + "title": "Metastore Cache Hit Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Hit rate", + "refId": "A" + } + ], + "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 109, + "title": "Event Recording (DP \u2192 CP)", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task success", + "refId": "A" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node success", + "refId": "B" + }, + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task failure", + "refId": "C" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node failure", + "refId": "D" + } + ], + "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 110, + "title": "Cache Discovery (hit/miss/skip)", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Hits", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Misses", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Skips", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Get failures", + "refId": "D" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 111, + "title": "K8s API Client Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "K8s requests/s", + "refId": "A" + } + ], + "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 112, + "title": "K8s API Client Latency (p90)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Request p90", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Rate limiter p90", + "refId": "B" + } + ], + "description": "K8s API request latency and client-side rate limiter wait time at p90." 
+ } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 401, + "title": "gRPC Client Request Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 402, + "title": "gRPC Client Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 403, + "title": "gRPC Client Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "dataplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "union", + "value": "union" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "union", + "value": "union" + } + ], + "query": "union", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Dataplane Overview", + "uid": "union-dp-overview", + "version": 1 + } +--- +# Source: dataplane/templates/nodeexecutor/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: executor + namespace: union + labels: + app: executor +data: + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + linkType: ide + templateUris: + - /dataplane/pod/v1/generated_name/6060/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/union-test/{{.namespace}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/{{.generatedName}}/ + - wandb-execution-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .podName }}' + - wandb-custom-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .taskConfig.id }}' + - comet-ml-execution-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .executionName }}{{ .nodeId }}{{ + .taskRetryAttempt }}{{ .taskConfig.link_suffix }}' + - 
comet-ml-custom-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .taskConfig.experiment_key }}' + - neptune-scale-run: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .podName }} + - neptune-scale-custom-id: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .taskConfig.id }} + kubernetes-enabled: true + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + config.yaml: | + executor: + cluster: 'union-test' + evaluatorCount: 64 + maxActions: 2000 + organization: 'union' + unionAuth: + injectSecret: true + secretName: EAGER_API_KEY + workerName: worker1 + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 4096 + gpu: 256 + memory: 2Ti + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + authorizer: + type: noop + catalog-cache: + cache-endpoint: dns:///union.test.union.ai + endpoint: dns:///union.test.union.ai + insecure: false + type: fallback + use-admin-auth: true + logger: + level: 4 + show-source: true + sharedService: + metrics: + scope: 'executor:' + security: + allowCors: true + allowLocalhostAccess: true + allowedHeaders: + - Content-Type + allowedOrigins: + - '*' + secure: false + useAuth: false + propeller: + node-config: + disable-input-file-writes: true + plugins: + fasttask: + additional-worker-args: + - --last-ack-grace-period-seconds + - "120" + callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + grace-period-status-not-found: 2m + ioutils: + remoteFileOutputPaths: + deckFilename: report.html + k8s: + disable-inject-owner-references: true + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/templates/operator/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + config.yaml: | + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: 
/etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + sharedService: + features: + gatewayV2: true + port: 8081 + authorizer: + type: noop + operator: + enabled: true + enableTunnelService: true + tunnel: + enableDirectToAppIngress: false + deploymentToRestart: union-operator-proxy + apps: + enabled: 'false' + syncClusterConfig: + enabled: false + clusterId: + organization: 'union' + clusterData: + appId: 'test-client' + bucketName: 'test-bucket' + bucketRegion: 'us-east-1' + cloudHostName: 'union.test.union.ai' + gcpProjectId: '' + metadataBucketPrefix: 's3://test-bucket' + userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + userRoleKey: 'eks.amazonaws.com/role-arn' + collectUsages: + enabled: true + billing: + model: Legacy + dependenciesHeartbeat: + prometheus: + endpoint: 'http://union-operator-prometheus:80/-/healthy' + propeller: + endpoint: 'http://flytepropeller:10254' + proxy: + endpoint: 'http://union-operator-proxy:10254' + imageBuilder: + enabled: true + executionNamespaceLabels: + union.ai/namespace-type: flyte + referenceConfigmapName: union-operator + targetConfigMapName: "build-image-config" + proxy: + imageBuilderConfig: + authenticationType: 'noop' + defaultRepository: '' + persistedLogs: + objectStore: + pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} + prefix: persisted-logs + sourceType: ObjectStore + smConfig: + enabled: 'true' + k8sConfig: + namespace: 'union' + type: 'K8s' + logger.yaml: | + logger: + level: 4 + show-source: true + config-overrides.yaml: | + cache: + identity: + enabled: false + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + fast_registration_storage.yaml: | + fastRegistrationStorage: + container: "" + type: s3 + connection: + auth-type: iam + region: us-east-1 + image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + image-builder.default-repository: "" + image-builder.authentication-type: "noop" +--- +# Source: dataplane/templates/prometheus/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + alerting: + alertmanagers: + - static_configs: + - targets: + rule_files: + - rules.yml + scrape_configs: + # Self-monitoring + - job_name: prometheus + metrics_path: /prometheus/metrics + static_configs: + - targets: ['localhost:9090'] + metric_relabel_configs: + - source_labels: [__name__] + regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total + action: keep + + # Kube state metrics for pod/node resource tracking and cost calculations + - job_name: kube-state-metrics + static_configs: + - targets: ['release-name-kube-state-metrics:8080'] + 
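+      # Cardinality control (note): every scrape job in this file allowlists the
+      # metric names it needs via `action: keep` relabeling, so only series that
+      # the dashboards and recording rules actually consume are ingested.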
metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total + action: keep + - separator: ; + source_labels: [__name__, phase] + regex: kube_pod_status_phase;(Succeeded|Failed) + action: drop + - source_labels: [node] + target_label: nodename + regex: '(.*)' + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: (.+) + target_label: label_node_pool_name + + # cAdvisor container metrics for CPU and memory tracking + - job_name: kubernetes-cadvisor + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes + action: keep + relabel_configs: + - separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + action: replace + + # Flyte propeller metrics for execution info and fast task duration + - job_name: flytepropeller + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - union + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # OpenCost metrics for cost tracking + - job_name: opencost + static_configs: + - targets: ['release-name-opencost:9003'] + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + rules.yml: | + + groups: + - name: cost_calculations_15s + interval: 15s + rules: + - record: pod_gpu_allocation + expr: | + sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) + - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
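+        # Join mechanics (note): the nested label_replace() calls below only copy
+        # existing labels into label_*-prefixed copies so that series keyed by
+        # label_entity_id can look up execution metadata. Illustrative query,
+        # not one of the recorded rules:
+        #   entity_id:total_cost:sum
+        #     * on (label_entity_id) group_left(label_entity_name) execution_info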
+ expr: | + max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ) + ) + - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( + label_replace( + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps + "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup + ), + "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key + ) + ) + - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_workspace_name, label_entity_id)( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces + "label_entity_id", "$1", "label_node_id", "(.*)" # join key + ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels + ) + ) + - record: fast_task_execution_duration + expr: | + max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ), + "namespace", "$1", "exported_namespace", "(.*)" + ), + "pod", "$1", "exported_pod", "(.*)" + ) + ) + - record: fast_task_execution_duration_rate + expr: | + irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration + - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
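+        # Allocation model (note): per pod, charge max(requested, consumed).
+        # Illustrative numbers: a pod requesting 4GiB whose working set grows to
+        # 6GiB is charged for 6GiB; a pod requesting 4GiB but using 1GiB is still
+        # charged for the full 4GiB request, since that capacity is reserved.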
+ expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity + # First, calculate the allocated memory for each pod + max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory + ( + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} + ) + ) + or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} 
== 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, calculate the allocated cpu for each pod + max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu + ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task 
labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + 
fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity + # First, calculate the used memory for each pod + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", 
label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:mem_usage_bytes_total_per_node:sum + ) + - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", 
"fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:cpu_usage_per_node:sum + ) + - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity + expr: | + avg by (label_entity_type, label_domain, label_project, label_entity_id) ( + # First, grab the SM occupancy for each pod + max by (namespace, pod) ( + DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # 
Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and 
allocated GPU count (something like "used memory", numerator of weighted calcs) + expr: | + entity_id:sm_occupancy:avg + * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum + - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. + expr: | + label_replace( + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:allocated_mem_cost:sum + or + entity_id:allocated_cpu_cost:sum + or + entity_id:allocated_gpu_cost:sum + ), + "type", "allocated", "", "" # add type info + ) + - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) + expr: | + label_replace( + sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) + # Start with each execution's and app's allocated cost per node + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity + / on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be 
different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts + ) + # Then multiply by the overhead cost per node + * on (node) group_left() ( + # To calculate overhead, start with the true cost of running each node + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes + * on (node) max by (node) ( + node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an unlabeled label to show up in the Compute Costs dashboard charts + ) * (15 / 3600) # convert hourly cost to 15-secondly cost + # Then subtract out the total allocated cost on each node + - on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + ) + ) + ), + "type", "overhead", "", "" # add type info + ) + - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) + expr: | + label_replace( + sum by (label_domain, label_project, label_entity_id, label_entity_type) ( + entity_id:allocated_cost:sum + or + entity_id:overhead_cost:sum + ), + "type", "total", "", "" # add type info + ) + - record: node:total_cost:sum # Total cost of all nodes + expr: | + sum ( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost + ) + - record: node_type:total_cost:sum # Total cost of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label + ) + 
- record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node, node_type)( # dedupe + label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel + ) + ) * (15 / 3600) # convert to number of hours per 15-second observation + # The group below aggregates the 15s series above into 15m rollup metrics + - name: cost_rollup_15m + interval: 15m + rules: + - record: execution_info15m + expr: | + max_over_time(execution_info[15m:15s]) + - record: app_info15m + expr: | + max_over_time(app_info[15m:15s]) + - record: workspace_info15m + expr: | + max_over_time(workspace_info[15m:15s]) + - record: entity_id:allocated_mem_bytes:sum15m + expr: | + sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) + - record: entity_id:used_mem_bytes:sum15m + expr: | + sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) + - record: entity_id:allocated_cpu:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) + - record: entity_id:used_cpu:sum15m + expr: | + sum_over_time(entity_id:used_cpu:sum[15m:15s]) + - record: entity_id:weighted_sm_occupancy:sum15m + expr: | + sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) + - record: entity_id:gpu_count:sum15m + expr: | + sum_over_time(entity_id:gpu_count:sum[15m:15s]) + - record: entity_id:allocated_mem_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) + - record: entity_id:allocated_cpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) + - record: entity_id:allocated_gpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) + - record: entity_id:allocated_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cost:sum[15m:15s]) + - record: entity_id:overhead_cost:sum15m + expr: | + sum_over_time(entity_id:overhead_cost:sum[15m:15s]) + - record: entity_id:total_cost:sum15m + expr: | + sum_over_time(entity_id:total_cost:sum[15m:15s]) + - record: node:total_cost:sum15m + expr: | + sum_over_time(node:total_cost:sum[15m:15s]) + - record: node_type:total_cost:sum15m + expr: | + sum_over_time(node_type:total_cost:sum[15m:15s]) + - record: node_type:uptime_hours:sum15m + expr: | + sum_over_time(node_type:uptime_hours:sum[15m:15s]) +--- +# Source: dataplane/templates/propeller/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-propeller-config + namespace: union +data: + admin.yaml: | + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + event: + capacity: 1000 + rate: 500 + type: admin + catalog.yaml: | + catalog-cache: + cache-endpoint: dns:///union.test.union.ai + endpoint: dns:///union.test.union.ai + insecure: false + type: fallback + use-admin-auth: true + copilot.yaml: | + plugins: + k8s: + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + core.yaml: | + propeller: + downstream-eval-duration: 30s + enable-admin-launcher: true + leader-election: + enabled: true + lease-duration: 15s + lock-config-map: + name: propeller-leader + namespace: 'union' + renew-deadline: 10s + retry-period: 2s + limit-namespace: all + literal-offloading-config: + enabled: true + max-workflow-retries: 30 + metadata-prefix: metadata/propeller + metrics-prefix: flyte + prof-port: 10254 + queue: + batch-size: -1 + batching-interval: 2s + queue: 
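+          # Workqueue tuning (summary): the outer "batch" queue drains ready items
+          # every batching-interval (batch-size -1 = no cap); "maxof" follows
+          # client-go's composite rate limiter, taking the larger of the
+          # exponential backoff delay (base-delay up to max-delay) and the
+          # token-bucket delay, with the bucket-limited sub-queue below.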
+ base-delay: 5s + capacity: 1000 + max-delay: 120s + rate: 100 + type: maxof + sub-queue: + capacity: 100 + rate: 10 + type: bucket + type: batch + rawoutput-prefix: 's3://test-bucket' + workers: 4 + workflow-reeval-duration: 30s + webhook: + certDir: /etc/webhook/certs + embeddedSecretManagerConfig: + imagePullSecrets: + enabled: true + k8sConfig: + namespace: 'union' + type: 'K8s' + listenPort: '9443' + secretManagerTypes: + - Embedded + - K8s + serviceName: flyte-pod-webhook + servicePort: '443' + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + logger.yaml: | + logger: + level: 4 + show-source: true + resource_manager.yaml: | + propeller: + resourcemanager: + type: noop + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + templateUris: + - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ + kubernetes-enabled: false + templates: + - displayName: Task Logs + scheme: TaskExecution + templateUris: + - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + 
verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +# Source: dataplane/charts/opencost/templates/clusterrole.yaml +# Cluster role giving opencost to get, list, watch required resources +# No write permissions are required +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] + resources: + - configmaps + - deployments + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - get + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - deployments + - daemonsets + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - get + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - get + - list + - watch + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-clustersync-resource +rules: + - apiGroups: + - "" + - rbac.authorization.k8s.io + resources: + - configmaps + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - clusterrolebindings + - podtemplates + verbs: + - '*' +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - events + - flyteworkflows + - pods/log + - pods + - rayjobs + - resourcequotas + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - nonResourceURLs: + - /metrics + verbs: + - get +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - pods + - endpoints + - services + verbs: + - get + - list + - watch + - nonResourceURLs: + - /metrics + - /metrics/cadvisor + verbs: + - get +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch +--- +# Source: 
dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: flytepropeller-role +rules: + # Allow RO access to PODS + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + # Allow Event recording access + - apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch + # Allow Access All plugin objects + - apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + # Allow Access to CRD + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: fluentbit-system + namespace: union +--- +# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-kube-state-metrics +subjects: +- kind: ServiceAccount + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-opencost +subjects: + - kind: ServiceAccount + name: release-name-opencost + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-resource +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-clustersync-resource +subjects: + - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: 
+ - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-executor +subjects: +- kind: ServiceAccount + name: executor + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: proxy-system +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-operator-prometheus +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Create a binding from Role -> ServiceAccount +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-webhook-role +subjects: + - kind: ServiceAccount + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: flytepropeller-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-role +subjects: + - kind: ServiceAccount + name: flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: operator-system + labels: + 
app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + - deployments + verbs: + - get + - list + - watch + - create + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system-secret +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/charts/fluentbit/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 2020 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + annotations: + prometheus.io/scrape: 'true' +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/opencost/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + selector: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + type: "ClusterIP" + ports: + - name: http + port: 9003 + targetPort: 9003 +--- +# Source: dataplane/templates/clusterresourcesync/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: syncresources + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm 
+spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/imagebuilder/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 1234 + targetPort: tcp + protocol: TCP + name: tcp + selector: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/nodeexecutor/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-dataplane-executor + labels: + platform.union.ai/prometheus-group: "union-services" + app: executor +spec: + type: ClusterIP + ports: + - port: 15605 + targetPort: 15605 + protocol: TCP + name: fasttask + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app: executor +--- +# Source: dataplane/templates/operator/service-proxy.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-proxy + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/operator/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/prometheus/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 9090 + protocol: TCP + name: http + selector: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name +--- +# Source: 
dataplane/templates/propeller/service-webhook.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + projectcontour.io/upstream-protocol.h2c: grpc +spec: + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: https + protocol: TCP + port: 443 + targetPort: 9443 + - name: debug + protocol: TCP + port: 10254 + targetPort: 10254 +--- +# Source: dataplane/templates/propeller/service-webhook.yaml +# Headless Service for cache invalidation — resolves to all pod IPs so that +# we can fan out invalidation requests to every webhook replica. +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook-headless + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: cache-internal + protocol: TCP + port: 9443 + targetPort: 9443 +--- +# Source: dataplane/templates/propeller/service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: union + name: flytepropeller + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: debug + protocol: TCP + port: 10254 + - name: fasttask + port: 15605 + protocol: TCP + targetPort: 15605 + selector: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/fluentbit/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + annotations: + checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + spec: + serviceAccountName: fluentbit-system + hostNetwork: false + dnsPolicy: ClusterFirst + containers: + - name: fluentbit + image: "cr.fluentbit.io/fluent/fluent-bit:3.2.8" + imagePullPolicy: IfNotPresent + command: + - /fluent-bit/bin/fluent-bit + args: + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + ports: + - name: http + containerPort: 2020 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: /api/v1/health + port: http + volumeMounts: + - name: config + mountPath: /fluent-bit/etc/conf + - mountPath: /var/log + name: varlog + - mountPath: /var/lib/docker/containers + name: varlibdockercontainers + readOnly: true + - mountPath: /etc/machine-id + name: etcmachineid + readOnly: true + volumes: + - name: config + configMap: + name: fluentbit-system + - hostPath: + path: /var/log + name: varlog + - hostPath: + path: 
/var/lib/docker/containers + name: varlibdockercontainers + - hostPath: + path: /etc/machine-id + type: File + name: etcmachineid + tolerations: + - operator: Exists +--- +# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: release-name-kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: kube-state-metrics + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + ports: + - containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: dataplane/charts/opencost/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: release-name-opencost + containers: + - name: release-name-opencost + image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 + 
imagePullPolicy: IfNotPresent + args: + ports: + - containerPort: 9003 + name: http + resources: + limits: + cpu: 1000m + memory: 4Gi + requests: + cpu: 500m + memory: 1Gi + startupProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 30 + livenessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 20 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + env: + - name: LOG_LEVEL + value: info + - name: CUSTOM_COST_ENABLED + value: "false" + - name: KUBECOST_NAMESPACE + value: union + - name: API_PORT + value: "9003" + - name: PROMETHEUS_SERVER_ENDPOINT + value: "http://union-operator-prometheus.union.svc:80/prometheus" + - name: CLUSTER_ID + value: "default-cluster" + - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS + value: "15" + - name: CLOUD_COST_ENABLED + value: "false" + - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL + value: "6" + - name: CLOUD_COST_REFRESH_RATE_HOURS + value: "6" + - name: CLOUD_COST_QUERY_WINDOW_DAYS + value: "7" + - name: CLOUD_COST_RUN_WINDOW_DAYS + value: "3" + # Add any additional provided variables +--- +# Source: dataplane/templates/clusterresourcesync/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-syncresources + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "b511750d960c272bb6a4f3ddbbfd46cfcaf0f7dfa7c3e4348c14af517722b00" + + labels: + + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - clusterresource + - --config + - /etc/flyte/config/*.yaml + - clusterresource + - run + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: sync-cluster-resources + resources: + limits: + cpu: "1" + memory: 500Mi + requests: + cpu: 500m + memory: 100Mi + volumeMounts: + - name: auth + mountPath: /etc/union/secret + - name: resource-templates + mountPath: /etc/flyte/clusterresource/templates + - name: config-volume + mountPath: /etc/flyte/config + ports: + - name: debug + containerPort: 10254 + protocol: TCP + serviceAccountName: union-clustersync-system + volumes: + - configMap: + name: union-clusterresource-template + name: resource-templates + - configMap: + name: union-clusterresourcesync-config + name: config-volume + 
- name: auth + secret: + secretName: union-secret-auth + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - flyte + - serve + - connector + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" + imagePullPolicy: "IfNotPresent" + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "10" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/imagebuilder/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + container.apparmor.security.beta.kubernetes.io/buildkit: unconfined + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: "union-imagebuilder" + containers: + - name: "buildkit" + image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - 
name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - mountPath: /home/user/.local/share/buildkit + name: buildkitd + - mountPath: /etc/buildkit + name: buildkit-config + args: + - --config + - /etc/buildkit/buildkitd.toml + - --addr + - unix:///run/user/1000/buildkit/buildkitd.sock + - --addr + - tcp://0.0.0.0:1234 + - --oci-worker-no-process-sandbox + ports: + - name: tcp + containerPort: 1234 + protocol: TCP + readinessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + livenessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + securityContext: + seccompProfile: # Needs Kubernetes >= 1.19 + type: Unconfined + runAsUser: 1000 + runAsGroup: 1000 + resources: + requests: + cpu: 1 + ephemeral-storage: 20Gi + memory: 1Gi + volumes: + - name: buildkitd + emptyDir: {} + - configMap: + name: union-operator-buildkit + name: buildkit-config + + nodeSelector: + app_pool: flyte + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/nodeexecutor/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executor + namespace: union + labels: + app: executor +spec: + replicas: 1 + selector: + matchLabels: + app: executor + template: + metadata: + annotations: + configChecksum: "3d931e5636192b94c904aa780a60effc2bb71861f72f22b448e711b33d41918" + + labels: + + app: executor + spec: + securityContext: + fsGroup: 1337 + serviceAccountName: executor + volumes: + - name: config-volume + configMap: + name: executor + - name: secret-volume + secret: + secretName: union-secret-auth + - name: auth + secret: + secretName: union-secret-auth + containers: + - name: executor + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + command: + - executor + - serve + - --config + - /etc/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + resources: + limits: + cpu: "4" + memory: "8Gi" + requests: + cpu: "1" + memory: "1Gi" + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: secret-volume + mountPath: /etc/union/secret + - name: auth + mountPath: /etc/secrets/ + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + 
operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment-proxy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-proxy + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + volumes: + - name: config-volume + projected: + sources: + - configMap: + name: union-operator + - configMap: + name: union-clusterresourcesync-config + - name: secret-volume + secret: + secretName: union-secret-auth + serviceAccountName: proxy-system + securityContext: + {} + containers: + - name: operator-proxy + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + args: + - operator + - proxy + - --config + - /etc/union/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: connect + containerPort: 8080 + protocol: TCP + - name: grpc + containerPort: 8081 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + - name: "tunnel" + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + args: + - cloudflared + - tunnel + - --no-autoupdate + - run + - --token + - $(TUNNEL_TOKEN) + env: + - name: TUNNEL_TOKEN + valueFrom: + secretKeyRef: + name: union-secret-auth + key: tunnel_token + optional: true + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + 
app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + serviceAccountName: operator-system + securityContext: + {} + volumes: + - name: config-volume + configMap: + name: union-operator + - name: secret-volume + secret: + secretName: union-secret-auth + containers: + - name: operator + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + args: + - operator + - serve + - --config + - /etc/union/config/*.yaml + - --operator.clusterId.name + - "$(CLUSTER_NAME)" + - --operator.tunnel.k8sSecretName + - union-secret-auth + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/prometheus/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "9505483b28e45abfefda9a9791a7719382b61225386ddfbdfea71a459a1423e" + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + spec: + priorityClassName: system-cluster-critical + serviceAccountName: union-operator-prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: prometheus 
+ image: "prom/prometheus:v3.3.1" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --web.external-url=/prometheus/ + - --web.route-prefix=/prometheus/ + - --storage.tsdb.retention.time=3d + ports: + - name: http + containerPort: 9090 + protocol: TCP + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + volumes: + - name: prometheus-config + configMap: + name: union-operator-prometheus + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create the actual deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flytepropeller-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + serviceAccountName: flytepropeller-webhook-system + initContainers: + - name: generate-secrets + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - init-certs + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + containers: + - name: webhook + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: 
limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + ports: + - containerPort: 9443 + - containerPort: 10254 + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + readOnly: true + - name: webhook-certs + mountPath: /etc/webhook/certs + readOnly: true + volumes: + - name: config-volume + configMap: + name: flyte-propeller-config + - name: webhook-certs + secret: + secretName: flyte-pod-webhook + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: union + name: flytepropeller + labels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + labels: + + + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + priorityClassName: system-cluster-critical + containers: + - command: + - flytepropeller + - --config + - /etc/flyte/config/*.yaml + - --propeller.cluster-id + - union-test + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: flytepropeller + ports: + - containerPort: 10254 + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + - name: auth + mountPath: /etc/union/secret + serviceAccountName: flytepropeller-system + volumes: + - configMap: + name: flyte-propeller-config + name: config-volume + - name: auth + secret: + secretName: union-secret-auth + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + 
nodeSelectorTerms: + - matchExpressions: + - key: workload-type + operator: In + values: + - flyte + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "release-name-fluentbit-test-connection" + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm + annotations: + helm.sh/hook: test + helm.sh/hook-delete-policy: hook-succeeded +spec: + containers: + - name: wget + image: "busybox:latest" + imagePullPolicy: Always + command: ["sh"] + args: ["-c", "sleep 5s && wget -O- release-name-fluentbit:2020"] + restartPolicy: Never diff --git a/tests/generated/dataplane.oci.yaml b/tests/generated/dataplane.oci.yaml index 6c239916..d21fa8ac 100644 --- a/tests/generated/dataplane.oci.yaml +++ b/tests/generated/dataplane.oci.yaml @@ -5490,6 +5490,9 @@ spec: - configMap: name: union-operator-buildkit name: buildkit-config + + nodeSelector: + flyte.org/node-role: worker affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -5498,6 +5501,12 @@ spec: app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoSchedule + key: flyte.org/node-role + operator: Equal + value: worker --- # Source: dataplane/templates/nodeexecutor/deployment.yaml apiVersion: apps/v1 @@ -5951,6 +5960,14 @@ spec: - name: prometheus-config configMap: name: union-operator-prometheus + + nodeSelector: + flyte.org/node-role: worker + tolerations: + - effect: NoSchedule + key: flyte.org/node-role + operator: Equal + value: worker --- # Source: dataplane/templates/propeller/deployment-webhook.yaml # Create the actual deployment diff --git a/tests/generated/dataplane.scheduling-override.yaml b/tests/generated/dataplane.scheduling-override.yaml new file mode 100644 index 00000000..91377e51 --- /dev/null +++ b/tests/generated/dataplane.scheduling-override.yaml @@ -0,0 +1,6332 @@ +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: flytesnacks-production +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-development +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: union-health-monitoring-staging +--- +# Source: dataplane/templates/common/namespaces.yaml +apiVersion: v1 +kind: Namespace 
+metadata: + name: union-health-monitoring-production +--- +# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluentbit-system + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +automountServiceAccountToken: true +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/imagebuilder/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-imagebuilder +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: executor + namespace: union + labels: + app: executor +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/prometheus/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: 
flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/common/auth-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: union-secret-auth + namespace: union +type: Opaque +data: + # TODO(rob): update or configure operator to use client_secret like all the other components. + app_secret: dGVzdC1zZWNyZXQ= + client_secret: dGVzdC1zZWNyZXQ= +--- +# Source: dataplane/templates/common/cluster-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: operator-cluster-name +type: Opaque +data: + cluster_name: dW5pb24tdGVzdA== +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create an empty secret that the first propeller pod will populate +apiVersion: v1 +kind: Secret +metadata: + name: flyte-pod-webhook + namespace: union +type: Opaque +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresourcesync-config + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + cluster_resources.yaml: | + cluster_resources: + clusterName: 'union-test' + customData: + - production: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - staging: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - development: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + refreshInterval: 5m + standaloneDeployment: true + templatePath: /etc/flyte/clusterresource/templates + clusterResourcesPrivate: + app: + isServerless: false + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + connection: + host: dns:///union.test.union.ai + admin.yaml: | + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + event: + capacity: 1000 + rate: 500 + type: admin + domain.yaml: | + domains: + - id: development + name: development + - id: staging + name: staging + - id: production + name: production + clusters.yaml: | + clusters: + clusterConfigs: [] + labelClusterMap: {} + logger.yaml: | + logger: + level: 4 + show-source: true +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-clusterresource-template + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + a_namespace.yaml: | + apiVersion: v1 + kind: Namespace + metadata: + name: {{ namespace }} + labels: + union.ai/namespace-type: flyte + spec: + finalizers: + - kubernetes + + b_default_service_account.yaml: | + 
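+    # Note: clusterresourcesync substitutes the namespace, defaultUserRoleKey, and defaultUserRoleValue placeholders in this template for every project/domain namespace it manages, using the customData values defined above.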
apiVersion: v1 + kind: ServiceAccount + metadata: + name: default + namespace: {{ namespace }} + annotations: + {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + + c_project_resource_quota.yaml: | + apiVersion: v1 + kind: ResourceQuota + metadata: + name: project-quota + namespace: {{ namespace }} + spec: + hard: + limits.cpu: {{ projectQuotaCpu }} + limits.memory: {{ projectQuotaMemory }} + requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} +--- +# Source: dataplane/templates/fluent-bit/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentbit-system + namespace: union + labels: + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +data: + custom_parsers.conf: | + [PARSER] + Name docker_no_time + Format json + Time_Keep Off + Time_Key time + Time_Format %Y-%m-%dT%H:%M:%S.%L + fluent-bit.conf: | + [SERVICE] + Parsers_File /fluent-bit/etc/parsers.conf + Parsers_File /fluent-bit/etc/conf/custom_parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + Health_Check On + [INPUT] + Name tail + Tag namespace-<namespace_name>.pod-<pod_name>.cont-<container_name> + Tag_Regex (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)- + Path /var/log/containers/*.log + DB /var/log/flb_kube.db + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + + + [OUTPUT] + Name s3 + Match * + upload_timeout 1m + s3_key_format /persisted-logs/$TAG + static_file_path true + json_date_key false + region us-east-1 + bucket test-bucket +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-buildkit +data: + buildkitd.toml: | + debug = false + + [log] + format = "text" + + [worker.oci] + enabled = true + snapshotter = "auto" + gc = true + max-parallelism = 0 + + # Should not be used if Policies are defined + gckeepstorage = "10%" + [[worker.oci.gcpolicy]] + # Remove COPY/ADD and git checkout files + keepBytes = "10%" + keepDuration = "24h" + filters = [ "type==source.local", "type==source.git.checkout" ] + [[worker.oci.gcpolicy]] + # Remove locally cached image layers after they're unused for 24 hours + keepBytes = "10%" + keepDuration = "24h" + filters = [ "regular" ] + [[worker.oci.gcpolicy]] + # Remove shared cache mounts, e.g.
Pip cache + keepBytes = "10%" + keepDuration = "72h" + filters = [ "type==exec.cachemount" ] + [[worker.oci.gcpolicy]] + # Remove everything else to keep the cache size under total file system limit + all = true + keepBytes = "80%" +--- +# Source: dataplane/templates/monitoring/dashboard-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-dashboard-union-dataplane-overview + namespace: union + labels: + grafana_dashboard: "1" + app.kubernetes.io/managed-by: Helm +data: + union-dataplane-overview.json: |- + { + "annotations": { + "list": [] + }, + "description": "Union Dataplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Active Workflows", + "type": "stat", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + } + ], + "description": "Current active FlyteWorkflow CRD count managed by Propeller." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_node_executions{namespace=\"$namespace\"})", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_task_executions{namespace=\"$namespace\"})", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Queue Depth", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", + "legendFormat": "Main", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", + "legendFormat": "Sub", + "refId": "B" + } + ], + "description": "Main and sub workqueue depth over time." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(executor:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics in DP services. Any non-zero value indicates a service caught a panic during request handling." + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all DP deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. Requires monitoring.slos.enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Execution Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:execution_success_rate or (union:dp:slo:propeller_success_rate + union:dp:slo:executor_success_rate) / 2 or union:dp:slo:propeller_success_rate or vector(1)", + "refId": "A" + } + ], + "description": "Combined V1 (propeller) and V2 (executor) task success rate. Falls back to propeller-only or 100% when idle." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Propeller Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", + "refId": "A" + } + ], + "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "DP service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 200, + "title": "Union Operator", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 201, + "title": "Work Queue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:work_queue:operations_processed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(union_operator:work_queue:operations_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "description": "Operator work queue operation processing and failure rates." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 202, + "title": "Background Process Runs / Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:heartbeat_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:heartbeat_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:status_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status runs", + "refId": "C" + }, + { + "expr": "rate(union_operator:status_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status errors", + "refId": "D" + }, + { + "expr": "rate(union_operator:prometheus_health_checker:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Prom health errors", + "refId": "E" + } + ], + "description": "Union Operator background process run and error rates: heartbeat updater, status updater, and Prometheus health checker."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 203, + "title": "Heartbeat Latency", + "type": "timeseries", + "targets": [ + { + "expr": "union_operator:heartbeat:compute_capabilities_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Capabilities p90", + "refId": "A" + }, + { + "expr": "union_operator:heartbeat:compute_usages_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Usages p90", + "refId": "B" + }, + { + "expr": "union_operator:heartbeat:list_workflows_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "List WFs p90", + "refId": "C" + } + ], + "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 204, + "title": "Config Syncer", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:config_syncer:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:config_syncer:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:config_syncer:propeller_configmap_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller CM updated", + "refId": "C" + } + ], + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 205, + "title": "Billable Usage Collector", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:billable_usage_collector:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:billable_usage_collector:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 206, + "title": "Work Queue Paused", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "union_operator:work_queue:paused{namespace=\"$namespace\"}", + "legendFormat": "Paused", + "refId": "A" + } + ], + "description": "1 when operator paused due to resource limits (FlyteWorkflow count or storage exceeded)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 300, + "title": "Executor (V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 301, + "title": "Active Actions & Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "legendFormat": "Active actions", + "refId": "A" + }, + { + "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "legendFormat": "Available capacity", + "refId": "B" + } + ], + "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 302, + "title": "Cache Discovery", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Miss", + "refId": "A" + }, + { + "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Put success", + "refId": "B" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 303, + "title": "Actions Terminated by Phase", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ phase }}", + "refId": "A" + } + ], + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 304, + "title": "Evaluator Duration (pod creation)", + "type": "timeseries", + "targets": [ + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Evaluate p50", + "refId": "A" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Evaluate p90", + "refId": "B" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "Evaluate p99", + "refId": "C" + } + ], + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 305, + "title": "System Failures & Invalid Leases", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "System failures", + "refId": "A" + }, + { + "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Exhausted retries", + "refId": "B" + }, + { + "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Invalid leases", + "refId": "C" + }, + { + "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Evaluate errors", + "refId": "D" + } + ], + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 100, + "title": "Flyte Propeller (V1)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 101, + "title": "Round Time (p50 / p90 / p99)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 102, + "title": "Round Success / Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Success", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Panics", + "refId": "C" + } + ], + "description": "Propeller round outcomes: success, errors, and panics per second." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 103, + "title": "Free Workers", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", + "legendFormat": "Free workers", + "refId": "A" + } + ], + "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 104, + "title": "Queue Add Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main adds", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sub adds", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main retries", + "refId": "C" + } + ], + "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 105, + "title": "Workflow Updates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Updated", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Too large", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Conflict", + "refId": "D" + } + ], + "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 106, + "title": "Workflow Update Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "etcd write latency for FlyteWorkflow status updates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 107, + "title": "Node Queueing & Execution Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Queue p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Queue p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", + "legendFormat": "Exec p90 (ms)", + "refId": "C" + } + ], + "description": "Node lifecycle: queueing latency (queued\u2192running) and execution latency (time in handler)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 108, + "title": "Metastore Cache Hit Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Hit rate", + "refId": "A" + } + ], + "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 109, + "title": "Event Recording (DP \u2192 CP)", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task success", + "refId": "A" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node success", + "refId": "B" + }, + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task failure", + "refId": "C" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node failure", + "refId": "D" + } + ], + "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 110, + "title": "Cache Discovery (hit/miss/skip)", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Hits", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Misses", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Skips", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Get failures", + "refId": "D" + } + ], + "description": "Propeller cache discovery outcomes: hit, miss, skip, and get-failure rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 111, + "title": "K8s API Client Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "K8s requests/s", + "refId": "A" + } + ], + "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 112, + "title": "K8s API Client Latency (p90)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Request p90", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Rate limiter p90", + "refId": "B" + } + ], + "description": "K8s API request latency and client-side rate limiter wait time at p90."
+ } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 401, + "title": "gRPC Client Request Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 402, + "title": "gRPC Client Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 403, + "title": "gRPC Client Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "dataplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "union", + "value": "union" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "union", + "value": "union" + } + ], + "query": "union", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Dataplane Overview", + "uid": "union-dp-overview", + "version": 1 + } +--- +# Source: dataplane/templates/nodeexecutor/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: executor + namespace: union + labels: + app: executor +data: + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + linkType: ide + templateUris: + - /dataplane/pod/v1/generated_name/6060/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/union-test/{{.namespace}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/{{.generatedName}}/ + - wandb-execution-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .podName }}' + - wandb-custom-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .taskConfig.id }}' + - comet-ml-execution-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .executionName }}{{ .nodeId }}{{ + .taskRetryAttempt }}{{ .taskConfig.link_suffix }}' + - 
comet-ml-custom-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .taskConfig.experiment_key }}' + - neptune-scale-run: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .podName }} + - neptune-scale-custom-id: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .taskConfig.id }} + kubernetes-enabled: true + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + config.yaml: | + executor: + cluster: 'union-test' + evaluatorCount: 64 + maxActions: 2000 + organization: 'union' + unionAuth: + injectSecret: true + secretName: EAGER_API_KEY + workerName: worker1 + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 4096 + gpu: 256 + memory: 2Ti + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + admin: + clientId: 'test-client' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///union.test.union.ai + insecure: false + authorizer: + type: noop + catalog-cache: + cache-endpoint: dns:///union.test.union.ai + endpoint: dns:///union.test.union.ai + insecure: false + type: fallback + use-admin-auth: true + logger: + level: 4 + show-source: true + sharedService: + metrics: + scope: 'executor:' + security: + allowCors: true + allowLocalhostAccess: true + allowedHeaders: + - Content-Type + allowedOrigins: + - '*' + secure: false + useAuth: false + propeller: + node-config: + disable-input-file-writes: true + plugins: + fasttask: + additional-worker-args: + - --last-ack-grace-period-seconds + - "120" + callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + grace-period-status-not-found: 2m + ioutils: + remoteFileOutputPaths: + deckFilename: report.html + k8s: + disable-inject-owner-references: true + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/templates/operator/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + config.yaml: | + union: + connection: + host: dns:///union.test.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-client' + clientSecretLocation: 
/etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + sharedService: + features: + gatewayV2: true + port: 8081 + authorizer: + type: noop + operator: + enabled: true + enableTunnelService: true + tunnel: + enableDirectToAppIngress: false + deploymentToRestart: union-operator-proxy + apps: + enabled: 'false' + syncClusterConfig: + enabled: false + clusterId: + organization: 'union' + clusterData: + appId: 'test-client' + bucketName: 'test-bucket' + bucketRegion: 'us-east-1' + cloudHostName: 'union.test.union.ai' + gcpProjectId: '' + metadataBucketPrefix: 's3://test-bucket' + userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + userRoleKey: 'eks.amazonaws.com/role-arn' + collectUsages: + enabled: true + billing: + model: Legacy + dependenciesHeartbeat: + prometheus: + endpoint: 'http://union-operator-prometheus:80/-/healthy' + propeller: + endpoint: 'http://flytepropeller:10254' + proxy: + endpoint: 'http://union-operator-proxy:10254' + imageBuilder: + enabled: true + executionNamespaceLabels: + union.ai/namespace-type: flyte + referenceConfigmapName: union-operator + targetConfigMapName: "build-image-config" + proxy: + imageBuilderConfig: + authenticationType: 'noop' + defaultRepository: '' + persistedLogs: + objectStore: + pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} + prefix: persisted-logs + sourceType: ObjectStore + smConfig: + enabled: 'true' + k8sConfig: + namespace: 'union' + type: 'K8s' + logger.yaml: | + logger: + level: 4 + show-source: true + config-overrides.yaml: | + cache: + identity: + enabled: false + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + fast_registration_storage.yaml: | + fastRegistrationStorage: + container: "" + type: s3 + connection: + auth-type: iam + region: us-east-1 + image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + image-builder.default-repository: "" + image-builder.authentication-type: "noop" +--- +# Source: dataplane/templates/prometheus/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + alerting: + alertmanagers: + - static_configs: + - targets: + rule_files: + - rules.yml + scrape_configs: + # Self-monitoring + - job_name: prometheus + metrics_path: /prometheus/metrics + static_configs: + - targets: ['localhost:9090'] + metric_relabel_configs: + - source_labels: [__name__] + regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total + action: keep + + # Kube state metrics for pod/node resource tracking and cost calculations + - job_name: kube-state-metrics + static_configs: + - targets: ['release-name-kube-state-metrics:8080'] + 
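+      # Allowlist relabeling: the keep action below drops every series whose name does not match the regex, bounding scrape-time cardinality.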
metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total + action: keep + - separator: ; + source_labels: [__name__, phase] + regex: kube_pod_status_phase;(Succeeded|Failed) + action: drop + - source_labels: [node] + target_label: nodename + regex: '(.*)' + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: (.+) + target_label: label_node_pool_name + + # cAdvisor container metrics for CPU and memory tracking + - job_name: kubernetes-cadvisor + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - role: node + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + metric_relabel_configs: + - separator: ; + source_labels: [__name__] + regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes + action: keep + relabel_configs: + - separator: ; + regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - separator: ; + regex: (.*) + target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + separator: ; + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + action: replace + + # Flyte propeller metrics for execution info and fast task duration + - job_name: flytepropeller + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - union + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # OpenCost metrics for cost tracking + - job_name: opencost + static_configs: + - targets: ['release-name-opencost:9003'] + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + rules.yml: | + + groups: + - name: cost_calculations_15s + interval: 15s + rules: + - record: pod_gpu_allocation + expr: | + sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) + - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
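+        # The nested label_replace calls below copy scrape labels onto label_*-prefixed names; label_entity_id is the join key shared with app_info and workspace_info.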
+ expr: | + max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ) + ) + - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( + label_replace( + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps + "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup + ), + "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key + ) + ) + - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_workspace_name, label_entity_id)( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces + "label_entity_id", "$1", "label_node_id", "(.*)" # join key + ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels + ) + ) + - record: fast_task_execution_duration + expr: | + max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ), + "namespace", "$1", "exported_namespace", "(.*)" + ), + "pod", "$1", "exported_pod", "(.*)" + ) + ) + - record: fast_task_execution_duration_rate + expr: | + irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration + - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
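+        # Worked example of the max(requested, used) policy encoded below: a pod
+        # requesting 2GiB whose working set grows to 3GiB is charged for 3GiB
+        # (the ">" arm keeps usage when it exceeds requests), while the same pod
+        # using only 1GiB is charged for its 2GiB request (the "or" arm falls back).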
+ expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity + # First, calculate the allocated memory for each pod + max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory + ( + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} + ) + ) + or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} 
== 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, calculate the allocated cpu for each pod + max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu + ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task 
labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + 
fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity + # First, calculate the used memory for each pod + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", 
label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:mem_usage_bytes_total_per_node:sum + ) + - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", 
"fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:cpu_usage_per_node:sum + ) + - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity + expr: | + avg by (label_entity_type, label_domain, label_project, label_entity_id) ( + # First, grab the SM occupancy for each pod + max by (namespace, pod) ( + DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # 
Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, + "label_entity_type", "fast_task", "", "" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and 
allocated GPU count (something like "used memory", numerator of weighted calcs) + expr: | + entity_id:sm_occupancy:avg + * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum + - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. + expr: | + label_replace( + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:allocated_mem_cost:sum + or + entity_id:allocated_cpu_cost:sum + or + entity_id:allocated_gpu_cost:sum + ), + "type", "allocated", "", "" # add type info + ) + - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) + expr: | + label_replace( + sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) + # Start with each execution's and app's allocated cost per node + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity + / on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be 
different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts + ) + # Then multiply by the overhead cost per node + * on (node) group_left() ( + # To calculate overhead, start with the true cost of running each node + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes + * on (node) max by (node) ( + node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an unlabeled label to show up in the Compute Costs dashboard charts + ) * (15 / 3600) # convert hourly cost to 15-secondly cost + # Then subtract out the total allocated cost on each node + - on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + ) + ) + ), + "type", "overhead", "", "" # add type info + ) + - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) + expr: | + label_replace( + sum by (label_domain, label_project, label_entity_id, label_entity_type) ( + entity_id:allocated_cost:sum + or + entity_id:overhead_cost:sum + ), + "type", "total", "", "" # add type info + ) + - record: node:total_cost:sum # Total cost of all nodes + expr: | + sum ( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost + ) + - record: node_type:total_cost:sum # Total cost of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label + ) + 
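+      # Worked example of the per-sample cost conversion used throughout: a node
+      # billed at $0.48/hour contributes 0.48 * (15 / 3600) = $0.002 per 15s
+      # evaluation, and the 240 evaluations in an hour sum back to $0.48, which
+      # is why the 15m rollups below use sum_over_time to reconstruct spend.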
+      - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type
+        expr: |
+          sum by (node_type)(
+            avg by (node, node_type)( # dedupe
+              label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel
+            )
+          ) * (15 / 3600) # convert to number of hours per 15-second observation
+    # Aggregate the above into visible metrics
+    - name: cost_rollup_15m
+      interval: 15m
+      rules:
+      - record: execution_info15m
+        expr: |
+          max_over_time(execution_info[15m:15s])
+      - record: app_info15m
+        expr: |
+          max_over_time(app_info[15m:15s])
+      - record: workspace_info15m
+        expr: |
+          max_over_time(workspace_info[15m:15s])
+      - record: entity_id:allocated_mem_bytes:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s])
+      - record: entity_id:used_mem_bytes:sum15m
+        expr: |
+          sum_over_time(entity_id:used_mem_bytes:sum[15m:15s])
+      - record: entity_id:allocated_cpu:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_cpu:sum[15m:15s])
+      - record: entity_id:used_cpu:sum15m
+        expr: |
+          sum_over_time(entity_id:used_cpu:sum[15m:15s])
+      - record: entity_id:weighted_sm_occupancy:sum15m
+        expr: |
+          sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s])
+      - record: entity_id:gpu_count:sum15m
+        expr: |
+          sum_over_time(entity_id:gpu_count:sum[15m:15s])
+      - record: entity_id:allocated_mem_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s])
+      - record: entity_id:allocated_cpu_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s])
+      - record: entity_id:allocated_gpu_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s])
+      - record: entity_id:allocated_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:allocated_cost:sum[15m:15s])
+      - record: entity_id:overhead_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:overhead_cost:sum[15m:15s])
+      - record: entity_id:total_cost:sum15m
+        expr: |
+          sum_over_time(entity_id:total_cost:sum[15m:15s])
+      - record: node:total_cost:sum15m
+        expr: |
+          sum_over_time(node:total_cost:sum[15m:15s])
+      - record: node_type:total_cost:sum15m
+        expr: |
+          sum_over_time(node_type:total_cost:sum[15m:15s])
+      - record: node_type:uptime_hours:sum15m
+        expr: |
+          sum_over_time(node_type:uptime_hours:sum[15m:15s])
+---
+# Source: dataplane/templates/propeller/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: flyte-propeller-config
+  namespace: union
+data:
+  admin.yaml: |
+    admin:
+      clientId: 'test-client'
+      clientSecretLocation: /etc/union/secret/client_secret
+      endpoint: dns:///union.test.union.ai
+      insecure: false
+    event:
+      capacity: 1000
+      rate: 500
+      type: admin
+  catalog.yaml: |
+    catalog-cache:
+      cache-endpoint: dns:///union.test.union.ai
+      endpoint: dns:///union.test.union.ai
+      insecure: false
+      type: fallback
+      use-admin-auth: true
+  copilot.yaml: |
+    plugins:
+      k8s:
+        co-pilot:
+          image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1'
+          name: flyte-copilot-
+          start-timeout: 30s
+  core.yaml: |
+    propeller:
+      downstream-eval-duration: 30s
+      enable-admin-launcher: true
+      leader-election:
+        enabled: true
+        lease-duration: 15s
+        lock-config-map:
+          name: propeller-leader
+          namespace: 'union'
+        renew-deadline: 10s
+        retry-period: 2s
+      limit-namespace: all
+      literal-offloading-config:
+        enabled: true
+      max-workflow-retries: 30
+      metadata-prefix: metadata/propeller
+      metrics-prefix: flyte
+      prof-port: 10254
+      queue:
+        batch-size: -1
+        batching-interval: 2s
+        queue:
+ base-delay: 5s + capacity: 1000 + max-delay: 120s + rate: 100 + type: maxof + sub-queue: + capacity: 100 + rate: 10 + type: bucket + type: batch + rawoutput-prefix: 's3://test-bucket' + workers: 4 + workflow-reeval-duration: 30s + webhook: + certDir: /etc/webhook/certs + embeddedSecretManagerConfig: + imagePullSecrets: + enabled: true + k8sConfig: + namespace: 'union' + type: 'K8s' + listenPort: '9443' + secretManagerTypes: + - Embedded + - K8s + serviceName: flyte-pod-webhook + servicePort: '443' + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + logger.yaml: | + logger: + level: 4 + show-source: true + resource_manager.yaml: | + propeller: + resourcemanager: + type: noop + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + dynamic-log-links: + - vscode: + displayName: VS Code Debugger + templateUris: + - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ + kubernetes-enabled: false + templates: + - displayName: Task Logs + scheme: TaskExecution + templateUris: + - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + 
verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +# Source: dataplane/charts/opencost/templates/clusterrole.yaml +# Cluster role giving opencost to get, list, watch required resources +# No write permissions are required +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] + resources: + - configmaps + - deployments + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - get + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - deployments + - daemonsets + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - get + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - get + - list + - watch + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-clustersync-resource +rules: + - apiGroups: + - "" + - rbac.authorization.k8s.io + resources: + - configmaps + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - clusterrolebindings + - podtemplates + verbs: + - '*' +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - events + - flyteworkflows + - pods/log + - pods + - rayjobs + - resourcequotas + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - nonResourceURLs: + - /metrics + verbs: + - get +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - pods + - endpoints + - services + verbs: + - get + - list + - watch + - nonResourceURLs: + - /metrics + - /metrics/cadvisor + verbs: + - get +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch +--- +# Source: 
dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: flytepropeller-role +rules: + # Allow RO access to PODS + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + # Allow Event recording access + - apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch + # Allow Access All plugin objects + - apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + # Allow Access to CRD + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: fluentbit-system + namespace: union +--- +# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + name: release-name-kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-kube-state-metrics +subjects: +- kind: ServiceAccount + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-opencost +subjects: + - kind: ServiceAccount + name: release-name-opencost + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-resource +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-clustersync-resource +subjects: + - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-clustersync-auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: 
+ - kind: ServiceAccount + name: union-clustersync-system + namespace: union +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-executor +subjects: +- kind: ServiceAccount + name: executor + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: proxy-system +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/templates/prometheus/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-operator-prometheus + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-operator-prometheus +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Create a binding from Role -> ServiceAccount +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flytepropeller-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-webhook-role +subjects: + - kind: ServiceAccount + name: flytepropeller-webhook-system + namespace: union +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: flytepropeller-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flytepropeller-role +subjects: + - kind: ServiceAccount + name: flytepropeller-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: operator-system + labels: + 
app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + - deployments + verbs: + - get + - list + - watch + - create + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system-secret +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/charts/fluentbit/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 2020 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + annotations: + prometheus.io/scrape: 'true' +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/opencost/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + selector: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + type: "ClusterIP" + ports: + - name: http + port: 9003 + targetPort: 9003 +--- +# Source: dataplane/templates/clusterresourcesync/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: syncresources + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm 
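+# Port 10254 below follows the debug/metrics port convention shared by the
+# Union services in this chart (flytepropeller's prof-port and the operator
+# heartbeat endpoints above use the same port).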
+spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/imagebuilder/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 1234 + targetPort: tcp + protocol: TCP + name: tcp + selector: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/nodeexecutor/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-dataplane-executor + labels: + platform.union.ai/prometheus-group: "union-services" + app: executor +spec: + type: ClusterIP + ports: + - port: 15605 + targetPort: 15605 + protocol: TCP + name: fasttask + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app: executor +--- +# Source: dataplane/templates/operator/service-proxy.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-proxy + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/operator/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/prometheus/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 9090 + protocol: TCP + name: http + selector: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name +--- +# Source: 
dataplane/templates/propeller/service-webhook.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + projectcontour.io/upstream-protocol.h2c: grpc +spec: + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: https + protocol: TCP + port: 443 + targetPort: 9443 + - name: debug + protocol: TCP + port: 10254 + targetPort: 10254 +--- +# Source: dataplane/templates/propeller/service-webhook.yaml +# Headless Service for cache invalidation — resolves to all pod IPs so that +# we can fan out invalidation requests to every webhook replica. +apiVersion: v1 +kind: Service +metadata: + name: flyte-pod-webhook-headless + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: cache-internal + protocol: TCP + port: 9443 + targetPort: 9443 +--- +# Source: dataplane/templates/propeller/service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: union + name: flytepropeller + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: debug + protocol: TCP + port: 10254 + - name: fasttask + port: 15605 + protocol: TCP + targetPort: 15605 + selector: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/fluentbit/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + annotations: + checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + spec: + serviceAccountName: fluentbit-system + hostNetwork: false + dnsPolicy: ClusterFirst + containers: + - name: fluentbit + image: "cr.fluentbit.io/fluent/fluent-bit:3.2.8" + imagePullPolicy: IfNotPresent + command: + - /fluent-bit/bin/fluent-bit + args: + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + ports: + - name: http + containerPort: 2020 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: /api/v1/health + port: http + volumeMounts: + - name: config + mountPath: /fluent-bit/etc/conf + - mountPath: /var/log + name: varlog + - mountPath: /var/lib/docker/containers + name: varlibdockercontainers + readOnly: true + - mountPath: /etc/machine-id + name: etcmachineid + readOnly: true + volumes: + - name: config + configMap: + name: fluentbit-system + - hostPath: + path: /var/log + name: varlog + - hostPath: + path: 
/var/lib/docker/containers + name: varlibdockercontainers + - hostPath: + path: /etc/machine-id + type: File + name: etcmachineid + tolerations: + - operator: Exists +--- +# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kube-state-metrics-5.30.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.15.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: release-name-kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: kube-state-metrics + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + ports: + - containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: dataplane/charts/opencost/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: release-name-opencost + containers: + - name: release-name-opencost + image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 + 
imagePullPolicy: IfNotPresent + args: + ports: + - containerPort: 9003 + name: http + resources: + limits: + cpu: 1000m + memory: 4Gi + requests: + cpu: 500m + memory: 1Gi + startupProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 30 + livenessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 20 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + env: + - name: LOG_LEVEL + value: info + - name: CUSTOM_COST_ENABLED + value: "false" + - name: KUBECOST_NAMESPACE + value: union + - name: API_PORT + value: "9003" + - name: PROMETHEUS_SERVER_ENDPOINT + value: "http://union-operator-prometheus.union.svc:80/prometheus" + - name: CLUSTER_ID + value: "default-cluster" + - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS + value: "15" + - name: CLOUD_COST_ENABLED + value: "false" + - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL + value: "6" + - name: CLOUD_COST_REFRESH_RATE_HOURS + value: "6" + - name: CLOUD_COST_QUERY_WINDOW_DAYS + value: "7" + - name: CLOUD_COST_RUN_WINDOW_DAYS + value: "3" + # Add any additional provided variables +--- +# Source: dataplane/templates/clusterresourcesync/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-syncresources + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "b511750d960c272bb6a4f3ddbbfd46cfcaf0f7dfa7c3e4348c14af517722b00" + + labels: + + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - clusterresource + - --config + - /etc/flyte/config/*.yaml + - clusterresource + - run + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: sync-cluster-resources + resources: + limits: + cpu: "1" + memory: 500Mi + requests: + cpu: 500m + memory: 100Mi + volumeMounts: + - name: auth + mountPath: /etc/union/secret + - name: resource-templates + mountPath: /etc/flyte/clusterresource/templates + - name: config-volume + mountPath: /etc/flyte/config + ports: + - name: debug + containerPort: 10254 + protocol: TCP + serviceAccountName: union-clustersync-system + volumes: + - configMap: + name: union-clusterresource-template + name: resource-templates + - configMap: + name: union-clusterresourcesync-config + name: config-volume + 
- name: auth + secret: + secretName: union-secret-auth + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - flyte + - serve + - connector + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" + imagePullPolicy: "IfNotPresent" + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "10" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector + + nodeSelector: + + app_pool: connectors + tolerations: + + - effect: NoSchedule + key: connectors + operator: Exists +--- +# Source: dataplane/templates/imagebuilder/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + container.apparmor.security.beta.kubernetes.io/buildkit: unconfined + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: "union-imagebuilder" + containers: + - name: "buildkit" + image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - mountPath: /home/user/.local/share/buildkit + name: buildkitd + - mountPath: /etc/buildkit + name: buildkit-config + args: + - --config + - /etc/buildkit/buildkitd.toml + - --addr + - unix:///run/user/1000/buildkit/buildkitd.sock + - --addr + - tcp://0.0.0.0:1234 + - 
--oci-worker-no-process-sandbox + ports: + - name: tcp + containerPort: 1234 + protocol: TCP + readinessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + livenessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + securityContext: + seccompProfile: # Needs Kubernetes >= 1.19 + type: Unconfined + runAsUser: 1000 + runAsGroup: 1000 + resources: + requests: + cpu: 1 + ephemeral-storage: 20Gi + memory: 1Gi + volumes: + - name: buildkitd + emptyDir: {} + - configMap: + name: union-operator-buildkit + name: buildkit-config + + nodeSelector: + app_pool: flyte + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/nodeexecutor/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executor + namespace: union + labels: + app: executor +spec: + replicas: 1 + selector: + matchLabels: + app: executor + template: + metadata: + annotations: + configChecksum: "3d931e5636192b94c904aa780a60effc2bb71861f72f22b448e711b33d41918" + + labels: + + app: executor + spec: + securityContext: + fsGroup: 1337 + serviceAccountName: executor + volumes: + - name: config-volume + configMap: + name: executor + - name: secret-volume + secret: + secretName: union-secret-auth + - name: auth + secret: + secretName: union-secret-auth + containers: + - name: executor + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + command: + - executor + - serve + - --config + - /etc/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + resources: + limits: + cpu: "4" + memory: "8Gi" + requests: + cpu: "1" + memory: "1Gi" + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: secret-volume + mountPath: /etc/union/secret + - name: auth + mountPath: /etc/secrets/ + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment-proxy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-proxy + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + template: + metadata: + 
annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + volumes: + - name: config-volume + projected: + sources: + - configMap: + name: union-operator + - configMap: + name: union-clusterresourcesync-config + - name: secret-volume + secret: + secretName: union-secret-auth + serviceAccountName: proxy-system + securityContext: + {} + containers: + - name: operator-proxy + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + args: + - operator + - proxy + - --config + - /etc/union/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: connect + containerPort: 8080 + protocol: TCP + - name: grpc + containerPort: 8081 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + - name: "tunnel" + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + args: + - cloudflared + - tunnel + - --no-autoupdate + - run + - --token + - $(TUNNEL_TOKEN) + env: + - name: TUNNEL_TOKEN + valueFrom: + secretKeyRef: + name: union-secret-auth + key: tunnel_token + optional: true + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/operator/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "d0aafb0ca0dd6f6ea74bab040527389351478d4be3142e010fa62874ea62dad" + + labels: + + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + serviceAccountName: operator-system + securityContext: + {} + volumes: + - name: config-volume + configMap: + name: union-operator + - name: secret-volume + secret: + secretName: union-secret-auth + containers: + - name: operator + securityContext: + {} + 
image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + args: + - operator + - serve + - --config + - /etc/union/config/*.yaml + - --operator.clusterId.name + - "$(CLUSTER_NAME)" + - --operator.tunnel.k8sSecretName + - union-secret-auth + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/prometheus/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-prometheus + namespace: union + labels: + helm.sh/chart: dataplane-2026.3.12 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.3.9" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "9505483b28e45abfefda9a9791a7719382b61225386ddfbdfea71a459a1423e" + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + spec: + priorityClassName: system-cluster-critical + serviceAccountName: union-operator-prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: prometheus + image: "prom/prometheus:v3.3.1" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --web.external-url=/prometheus/ + - --web.route-prefix=/prometheus/ + - --storage.tsdb.retention.time=3d + ports: + - name: http + containerPort: 9090 + protocol: TCP + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + volumes: + - name: prometheus-config + configMap: + name: union-operator-prometheus + + nodeSelector: + + app_pool: monitoring + tolerations: + + - effect: NoSchedule + key: monitoring + operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Create the actual deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + 
name: flytepropeller-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + serviceAccountName: flytepropeller-webhook-system + initContainers: + - name: generate-secrets + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - init-certs + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + containers: + - name: webhook + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + ports: + - containerPort: 9443 + - containerPort: 10254 + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + readOnly: true + - name: webhook-certs + mountPath: /etc/webhook/certs + readOnly: true + volumes: + - name: config-volume + configMap: + name: flyte-propeller-config + - name: webhook-certs + secret: + secretName: flyte-pod-webhook + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + 
operator: Equal + value: "true" +--- +# Source: dataplane/templates/propeller/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: union + name: flytepropeller + labels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "3547b02950188e2d00988cfa7366bf0853b0ec87f9867e20e1946c4b414829e" + + labels: + + + app.kubernetes.io/name: flytepropeller + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + priorityClassName: system-cluster-critical + containers: + - command: + - flytepropeller + - --config + - /etc/flyte/config/*.yaml + - --propeller.cluster-id + - union-test + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.3.9" + imagePullPolicy: "IfNotPresent" + name: flytepropeller + ports: + - containerPort: 10254 + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + - name: auth + mountPath: /etc/union/secret + serviceAccountName: flytepropeller-system + volumes: + - configMap: + name: flyte-propeller-config + name: config-volume + - name: auth + secret: + secretName: union-secret-auth + + nodeSelector: + app_pool: flyte + tolerations: + - effect: NoSchedule + key: flyte + operator: Equal + value: "true" +--- +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "release-name-fluentbit-test-connection" + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm + annotations: + helm.sh/hook: test + helm.sh/hook-delete-policy: hook-succeeded +spec: + containers: + - name: wget + image: "busybox:latest" + imagePullPolicy: Always + command: ["sh"] + args: ["-c", "sleep 5s && wget -O- release-name-fluentbit:2020"] + restartPolicy: Never diff --git 
a/tests/values/dataplane.global-scheduling.yaml b/tests/values/dataplane.global-scheduling.yaml
new file mode 100644
index 00000000..42d8aee4
--- /dev/null
+++ b/tests/values/dataplane.global-scheduling.yaml
@@ -0,0 +1,42 @@
+# Test that global scheduling (nodeSelector, tolerations, affinity) cascades
+# to all Union-owned components: propeller, executor, webhook, operator, proxy,
+# clusterresourcesync, prometheus, flyteconnector, and imagebuilder/buildkit.
+
+host: union.test.union.ai
+clusterName: union-test
+orgName: union
+provider: aws
+
+storage:
+  provider: aws
+  authType: iam
+  bucketName: test-bucket
+  region: us-east-1
+
+secrets:
+  admin:
+    create: true
+    clientSecret: test-secret
+    clientId: test-client
+
+# Enable flyteconnector to verify it also gets global scheduling
+flyteconnector:
+  enabled: true
+
+scheduling:
+  nodeSelector:
+    app_pool: flyte
+  tolerations:
+    - key: "flyte"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: workload-type
+                operator: In
+                values:
+                  - flyte
diff --git a/tests/values/dataplane.scheduling-override.yaml b/tests/values/dataplane.scheduling-override.yaml
new file mode 100644
index 00000000..684b3d42
--- /dev/null
+++ b/tests/values/dataplane.scheduling-override.yaml
@@ -0,0 +1,47 @@
+# Test that per-service scheduling overrides take precedence over global scheduling.
+# Global scheduling sets app_pool: flyte; prometheus and flyteconnector override it with their own pools (monitoring and connectors).
+
+host: union.test.union.ai
+clusterName: union-test
+orgName: union
+provider: aws
+
+storage:
+  provider: aws
+  authType: iam
+  bucketName: test-bucket
+  region: us-east-1
+
+secrets:
+  admin:
+    create: true
+    clientSecret: test-secret
+    clientId: test-client
+
+scheduling:
+  nodeSelector:
+    app_pool: flyte
+  tolerations:
+    - key: "flyte"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
+
+# Per-service overrides should win over global
+prometheus:
+  nodeSelector:
+    app_pool: monitoring
+  tolerations:
+    - key: "monitoring"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
+
+flyteconnector:
+  enabled: true
+  nodeSelector:
+    app_pool: connectors
+  tolerations:
+    - key: "connectors"
+      operator: "Exists"
+      effect: "NoSchedule"
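
One way to sanity-check these two fixtures locally is to render the chart with each values file and assert on the nodeSelector of a few representative Deployments from the snapshot above. The sketch below is illustrative only and not part of this change: the release name ("release-name"), the chart path ("charts/dataplane"), and the PyYAML dependency are assumptions; the Deployment names and expected nodeSelector values are taken from the rendered output in this diff.

    #!/usr/bin/env python3
    # Hand-written spot-check for the scheduling fixtures (not part of the chart's
    # test suite). Requires helm on PATH and PyYAML installed.
    import subprocess
    import yaml

    def rendered_docs(values_file):
        # Render the chart with one fixture and parse every manifest document.
        out = subprocess.run(
            ["helm", "template", "release-name", "charts/dataplane", "-f", values_file],
            check=True, capture_output=True, text=True,
        ).stdout
        return [d for d in yaml.safe_load_all(out) if d]

    def node_selector(docs, kind, name):
        for d in docs:
            if d.get("kind") == kind and d["metadata"]["name"] == name:
                return d["spec"]["template"]["spec"].get("nodeSelector")
        raise LookupError(f"{kind}/{name} not rendered")

    # Global scheduling should cascade to components with no per-service override.
    docs = rendered_docs("tests/values/dataplane.global-scheduling.yaml")
    assert node_selector(docs, "Deployment", "flytepropeller") == {"app_pool": "flyte"}

    # Per-service values should win where they are set.
    docs = rendered_docs("tests/values/dataplane.scheduling-override.yaml")
    assert node_selector(docs, "Deployment", "union-operator-prometheus") == {"app_pool": "monitoring"}
    assert node_selector(docs, "Deployment", "flyteconnector") == {"app_pool": "connectors"}
    print("scheduling fixtures render as expected")

If the fallback logic regressed (for example, a per-service block shadowing the global one without rendering anything), the assertions would fail with the actually-rendered nodeSelector, which makes the failure mode easy to read from the traceback.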