diff --git a/charts/controlplane/dashboards/union-controlplane-overview.json b/charts/controlplane/dashboards/union-controlplane-overview.json index aecb5405..e37e0cb9 100644 --- a/charts/controlplane/dashboards/union-controlplane-overview.json +++ b/charts/controlplane/dashboards/union-controlplane-overview.json @@ -1042,7 +1042,7 @@ "y": 33 }, "id": 300, - "title": "FlyteAdmin (V1 + V2)", + "title": "FlyteAdmin", "type": "row", "panels": [ { @@ -1301,7 +1301,7 @@ "y": 34 }, "id": 400, - "title": "Executions (V1 + V2)", + "title": "Executions", "type": "row", "panels": [ { @@ -1807,7 +1807,7 @@ "y": 34 }, "id": 500, - "title": "Queue / Run-Scheduler (V2)", + "title": "Queue / Run-Scheduler", "type": "row", "panels": [ { @@ -2224,7 +2224,7 @@ "y": 35 }, "id": 600, - "title": "Cluster Service (V1 + V2)", + "title": "Cluster Service", "type": "row", "panels": [ { @@ -2299,12 +2299,12 @@ "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -2548,7 +2548,7 @@ "y": 36 }, "id": 900, - "title": "CacheService (V1 + V2)", + "title": "CacheService", "type": "row", "panels": [ { @@ -2581,17 +2581,17 @@ "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": 
"rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -2628,17 +2628,17 @@ "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": "C" } @@ -2656,397 +2656,566 @@ "y": 36 }, "id": 750, - "title": "Authorizer (V1 + V2)", + "title": "Authorizer", "type": "row", "panels": [ { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" + "id": 760, + "title": "Authorizer Mode", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 37 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 15 + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "name", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, - "id": 751, - "title": "Allow / Deny Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Allowed", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", + "legendFormat": "{{type}}", "refId": "A" - }, - { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Denied", - "refId": "B" } ], - "description": "Authorization decision rate. Allow/deny ratio indicates auth health. High deny rate may signal misconfigured policies. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 751, + "title": "Allow / Deny Rate", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 10, + "x": 4, + "y": 37 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "spanNulls": false }, - "unit": "ms" - } + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*denied.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*allowed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 15 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi" + } }, - "id": 752, - "title": "Authorize Latency", - "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "allowed ({{identity_type}})", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", + "expr": "sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "denied ({{identity_type}})", "refId": "B" - }, - { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", 
- "refId": "C" } ], - "description": "End-to-end Authorize() latency including identity resolution and backend authorization check. [Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 753, + "title": "Deny Rate (%)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 10, + "x": 14, + "y": 37 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "unit": "percentunit", "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "percentunit" - } + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "noValue": "0", + "decimals": 1, + "min": 0, + "max": 1 + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 15 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 753, - "title": "Deny Rate (%)", - "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Deny %", + "expr": "(sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)) / clamp_min((sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])) + sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))), 1e-10)", + "legendFormat": "{{identity_type}}", "refId": "A" } ], - "description": 
"Percentage of authorization decisions that denied access. Spikes indicate policy changes or auth issues. [Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 752, + "title": "Authorize Latency (service)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 45 }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" + "unit": "ms", + "custom": { + "drawStyle": "line", + "fillOpacity": 10 }, + "noValue": "0", + "decimals": 1, "thresholds": { + "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 200 } ] - }, - "mappings": [ - { - "type": "value", - "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } - } - } - ] - } - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 0, - "y": 23 + } + }, + "overrides": [] }, - "id": 760, "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^type$/" + "legend": { + "displayMode": "list", + "placement": "bottom" }, - "textMode": "value" + "tooltip": { + "mode": "multi" + } }, - "title": "Authorizer Mode", - "type": "stat", "targets": [ { - "expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", - "legendFormat": "{{ type }}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", "refId": "A" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": 
"authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Currently active authorizer backend type (Noop, UserClouds, External, Authorizer)." - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 761, + "title": "Backend Latency", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 45 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "unit": "ms", "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "ms" - } + "noValue": "0", + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 200 + } + ] + } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 4, - "y": 23 + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 761, - "title": "External Backend Latency", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by (le) 
(rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": "p95", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": "p99", "refId": "C" } ], - "description": "Latency of calls to the external authorization backend (p50/p95/p99)." - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 764, + "title": "Decisions by Action", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 45 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "drawStyle": "bars", + "fillOpacity": 50, + "stacking": { + "mode": "normal" + } }, - "unit": "ops" - } + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 23 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "sum" + ] + }, + "tooltip": { + "mode": "multi" + } }, - "id": 762, - "title": "External Errors by gRPC Code", - "type": "timeseries", "targets": [ { - "expr": "sum by (grpc_code) (rate(authorizer:authorizer:cloudauthorizer:connect:external:errors{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_code }}", + "expr": "sum by (action, identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{action}} {{identity_type}} (allowed)", "refId": "A" + }, + { + "expr": "sum by (action, 
identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{action}} {{identity_type}} (denied)", + "refId": "B" } ], - "description": "Error rate from the external authorization backend, broken down by gRPC status code." - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 762, + "title": "Backend Errors", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 53 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "ops" - } + "noValue": "0", + "unit": "ops", + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 23 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 763, - "title": "Fail-Open Activations", - "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Fail-Open", + "expr": "sum by (error_type) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_errors{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "{{error_type}}", "refId": "A" } ], - "description": "Rate of fail-open activations. Non-zero means the external backend is unreachable and requests are being allowed without authorization." 
- }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 765, + "title": "Error Attribution", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 53 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } + "fillOpacity": 10 }, - "unit": "ops" - } + "noValue": "0", + "unit": "ops", + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 31 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 764, - "title": "Decisions by Action", - "type": "timeseries", "targets": [ { - "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "allowed: {{ action }}", + "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "{{error_source}}", "refId": "A" - }, - { - "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "denied: {{ action }}", - "refId": "B" } ], - "description": "Authorization decisions broken down by action (e.g. read, write, execute). Stacked to show total volume." 
- }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 763, + "title": "Fail-Open Activations", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 53 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "ops" - } + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.001 + } + ] + }, + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 31 + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 765, - "title": "Error Attribution", - "type": "timeseries", "targets": [ { - "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ error_source }}", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval]) or vector(0)", + "legendFormat": "fail-open", "refId": "A" } ], - "description": "Authorization errors attributed by source (e.g. identity resolution, backend, policy evaluation)." 
+ "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ] }, @@ -3366,17 +3535,17 @@ "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -3566,4 +3735,4 @@ "title": "Union Controlplane Overview", "uid": "union-cp-overview", "version": 2 -} +} \ No newline at end of file diff --git a/charts/controlplane/templates/authz/configmap.yaml b/charts/controlplane/templates/authz/configmap.yaml index 37fdda2f..5dbf8651 100644 --- a/charts/controlplane/templates/authz/configmap.yaml +++ b/charts/controlplane/templates/authz/configmap.yaml @@ -1,4 +1,4 @@ -{{- if eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds" -}} +{{- if or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") -}} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/controlplane/templates/authz/deployment.yaml b/charts/controlplane/templates/authz/deployment.yaml index d93e44a6..d4419f40 100644 --- a/charts/controlplane/templates/authz/deployment.yaml +++ b/charts/controlplane/templates/authz/deployment.yaml @@ -1,4 +1,4 @@ -{{- if eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds" -}} +{{- if or 
(eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") -}} apiVersion: apps/v1 kind: Deployment metadata: diff --git a/charts/controlplane/templates/authz/hpa.yaml b/charts/controlplane/templates/authz/hpa.yaml index 9f8de177..41fe2809 100644 --- a/charts/controlplane/templates/authz/hpa.yaml +++ b/charts/controlplane/templates/authz/hpa.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.autoscaling.enabled }} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.autoscaling.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: diff --git a/charts/controlplane/templates/authz/networkpolicy.yaml b/charts/controlplane/templates/authz/networkpolicy.yaml index 3105c0c6..87d6e18e 100644 --- a/charts/controlplane/templates/authz/networkpolicy.yaml +++ b/charts/controlplane/templates/authz/networkpolicy.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.networkPolicy.enabled }} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.networkPolicy.enabled }} apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: diff --git a/charts/controlplane/templates/authz/pdb.yaml b/charts/controlplane/templates/authz/pdb.yaml index 42e1a421..71bb7229 100644 --- a/charts/controlplane/templates/authz/pdb.yaml +++ 
b/charts/controlplane/templates/authz/pdb.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.pdb.enabled }} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.pdb.enabled }} apiVersion: policy/v1 kind: PodDisruptionBudget metadata: diff --git a/charts/controlplane/templates/authz/rbac.yaml b/charts/controlplane/templates/authz/rbac.yaml index d681d292..bd7fbf8d 100644 --- a/charts/controlplane/templates/authz/rbac.yaml +++ b/charts/controlplane/templates/authz/rbac.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.serviceAccount.create -}} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.serviceAccount.create -}} apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: diff --git a/charts/controlplane/templates/authz/service.yaml b/charts/controlplane/templates/authz/service.yaml index 79cfa05f..a969eddc 100644 --- a/charts/controlplane/templates/authz/service.yaml +++ b/charts/controlplane/templates/authz/service.yaml @@ -1,4 +1,4 @@ -{{- if eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds" -}} +{{- if or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") -}} apiVersion: v1 kind: Service metadata: diff --git a/charts/controlplane/templates/authz/serviceaccount.yaml 
b/charts/controlplane/templates/authz/serviceaccount.yaml index 2b3fab43..971dc7fc 100644 --- a/charts/controlplane/templates/authz/serviceaccount.yaml +++ b/charts/controlplane/templates/authz/serviceaccount.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.serviceAccount.create -}} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.serviceAccount.create -}} apiVersion: v1 kind: ServiceAccount metadata: diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index cdf885ca..c8fb66f5 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -108,20 +108,12 @@ global: DATAPLANE_ENDPOINT: "" # --- Authentication Configuration --- - # Set all values below to enable OIDC authentication. - # Supports any OAuth2/OIDC provider (Okta, Azure AD, Auth0, Keycloak, etc.) - # - # OIDC issuer URL - # Example: "https://dev-123456.okta.com/oauth2/default" - OIDC_BASE_URL: "" - # Flyteadmin OIDC client ID for browser login flow - # Example: "0oa1abc2def3ghi4j5k6" - OIDC_CLIENT_ID: "" - # CLI client ID for flytectl / uctl (public OAuth app, PKCE flow) - # Example: "0oa7mno8pqr9stu0v1w2" - CLI_CLIENT_ID: "" - # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. - # Set them in your environment-specific overlay (Terraform-generated values). + # All OIDC/OAuth2 globals are defined in the base values.yaml with documentation. + # Set them in your environment-specific values overlay generated by Terraform. + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are also in the base values.yaml. 
+ # Set them in your environment-specific values overlay. + # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). + # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. # ---------------------------------------------------------------------------- # SECTION 2: Image Tag Overrides @@ -267,36 +259,8 @@ flyte: # Subject to removal in the future singleTenantOrgID: '{{ .Values.global.UNION_ORG }}' - # --- OIDC Authentication --- - # To enable authentication, set server.security.useAuth: true - # and configure the auth globals in Section 1 above. - # server: - # security: - # useAuth: true - auth: - httpAuthorizationHeader: "flyte-authorization" - grpcAuthorizationHeader: "flyte-authorization" - authorizedUris: - - "http://flyteadmin:80" - - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' - appAuth: - authServerType: "External" - externalAuthServer: - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - thirdPartyConfig: - flyteClient: - clientId: '{{ .Values.global.CLI_CLIENT_ID }}' - redirectUri: "http://localhost:53593/callback" - scopes: ["all"] - userAuth: - openId: - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' - scopes: ["profile", "openid", "offline_access"] - cookieSetting: - sameSitePolicy: "LaxMode" - domain: "" - idpQueryParameter: "idp" + # adminServer.auth is now fully configured in the base values.yaml + # using globals. No overlay-specific auth config needed. # Enable scheduler auth secret mount so flyte-secret-auth is mounted at /etc/secrets/. 
# Set clientSecret: null so the subchart does NOT create the secret — it must be @@ -364,10 +328,22 @@ ingress: - "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" secretName: "{{ .Values.global.TLS_SECRET_NAME }}" + # --- Ingress Annotations (shared across all ingress objects) --- + annotations: + # Allow the nginx controller's internal DNS to match ingress rules so that + # intra-cluster traffic (DP → CP via nginx service DNS) is routed through + # the same auth subrequest as external traffic. Without this, the :authority + # header won't match the ingress host and auth is bypassed. + nginx.ingress.kubernetes.io/server-alias: "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" + # --- Protected Ingress Auth Annotations --- # These configure nginx to validate requests via flyteadmin's /me endpoint # and redirect unauthenticated users to /login for the OIDC flow. # Active when OIDC authentication is enabled (server.security.useAuth: true). + # + # All protected endpoints use "https://$host/me" so the auth subrequest goes + # through nginx itself. This ensures verifyClaims runs on the access token, + # which resolves identitytype for all callers (browser, CLI, service-to-service). 
protectedIngressAnnotations: nginx.ingress.kubernetes.io/auth-url: "https://$host/me" nginx.ingress.kubernetes.io/auth-signin: "https://$host/login?redirect_url=$escaped_request_uri" @@ -482,23 +458,6 @@ services: # Connect to dataplane ingress controller secureTunnelTenantURLPattern: '{{ .Values.global.DATAPLANE_ENDPOINT }}' - # Executions service configuration - executions: - configMap: - executions: - app: - adminClient: - connection: - # Flyteadmin endpoint for executions service - endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}' - insecure: true - # --- Auth fields (active when OIDC is enabled) --- - authorizationHeader: "flyte-authorization" - clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' - clientSecretLocation: "/etc/secrets/union/client_secret" - tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' - scopes: ["all"] - # ---------------------------------------------------------------------------- # Monitoring Configuration (AWS/EKS specific) # ---------------------------------------------------------------------------- diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index eb93fbf0..4dba7891 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -116,20 +116,10 @@ global: IMAGE_REPOSITORY_PREFIX: "registry.unionai.cloud/controlplane" # --- Authentication Configuration --- - # Set all values below to enable OIDC authentication. - # Supports any OAuth2/OIDC-compliant identity provider. 
- # - # OIDC issuer URL - # Example: "https://login.example.com/oauth2/default" - OIDC_BASE_URL: "" - # Flyteadmin OIDC client ID for browser login flow - # Example: "0oa1abc2def3ghi4j5k6" - OIDC_CLIENT_ID: "" - # CLI client ID for flytectl / uctl (public OAuth app, PKCE flow) - # Example: "0oa7mno8pqr9stu0v1w2" - CLI_CLIENT_ID: "" - # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. - # Set them in your environment-specific overlay (Terraform-generated values). + # All OIDC/OAuth2 globals are defined in the base values.yaml with documentation. + # Set them in your environment-specific values overlay generated by Terraform. + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are also in the base values.yaml. + # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. # ---------------------------------------------------------------------------- # SECTION 2: Image Tag Overrides @@ -297,36 +287,8 @@ flyte: # Subject to removal in the future singleTenantOrgID: '{{ .Values.global.UNION_ORG }}' - # --- OIDC Authentication --- - # To enable authentication, set server.security.useAuth: true - # and configure the auth globals in Section 1 above. 
- # server: - # security: - # useAuth: true - auth: - httpAuthorizationHeader: "flyte-authorization" - grpcAuthorizationHeader: "flyte-authorization" - authorizedUris: - - "http://flyteadmin:80" - - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' - appAuth: - authServerType: "External" - externalAuthServer: - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - thirdPartyConfig: - flyteClient: - clientId: '{{ .Values.global.CLI_CLIENT_ID }}' - redirectUri: "http://localhost:53593/callback" - scopes: ["all"] - userAuth: - openId: - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' - scopes: ["profile", "openid", "offline_access"] - cookieSetting: - sameSitePolicy: "LaxMode" - domain: "" - idpQueryParameter: "idp" + # adminServer.auth is now fully configured in the base values.yaml + # using globals. No overlay-specific auth config needed. # Enable scheduler auth secret mount so flyte-secret-auth is mounted at /etc/secrets/. # Set clientSecret: "placeholder" so the subchart renders the secret — it must be @@ -394,7 +356,14 @@ ingress: - "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" secretName: "{{ .Values.global.TLS_SECRET_NAME }}" - # Protected ingress auth annotations are now defined in the base values.yaml. + # --- Ingress Annotations (shared across all ingress objects) --- + annotations: + # Allow the nginx controller's internal DNS to match ingress rules so that + # intra-cluster traffic (DP → CP via nginx service DNS) is routed through + # the same auth subrequest as external traffic. + nginx.ingress.kubernetes.io/server-alias: "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" + + # Protected ingress auth annotations are defined in the base values.yaml. # Override here only if you need to customize auth behavior for this deployment mode. 
# ---------------------------------------------------------------------------- @@ -496,23 +465,6 @@ services: # Connect to dataplane ingress controller secureTunnelTenantURLPattern: '{{ .Values.global.DATAPLANE_ENDPOINT }}' - # Executions service configuration - executions: - configMap: - executions: - app: - adminClient: - connection: - # Flyteadmin endpoint for executions service - endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}' - insecure: true - # --- Auth fields (active when OIDC is enabled) --- - authorizationHeader: "flyte-authorization" - clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' - clientSecretLocation: "/etc/secrets/union/client_secret" - tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' - scopes: ["all"] - # ---------------------------------------------------------------------------- # SECTION 9: ScyllaDB Configuration # ---------------------------------------------------------------------------- diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index 9c17bb22..4c983ecc 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -54,15 +54,37 @@ global: # Ingress controller provider. Options: "nginx", "envoy", "both" INGRESS_PROVIDER: nginx - # OAuth2 client ID for service-to-service authentication (client_credentials flow). - # Services use this to acquire tokens for internal calls through nginx. - # Example: "0oa3xyz4abc5def6g7h8" + # OAuth2 client ID for service-to-service authentication (client_credentials grant). + # Used by controlplane services (executions, queue, cluster) to acquire tokens + # for internal calls through nginx. This is OAuth App 3 ("internal/service-to-service") + # in the authentication architecture. + # Okta example: "0oa3xyz4abc5def6g7h8" + # Entra ID example: "dc0ea3fc-f32b-4df4-98c1-3681e5a36bc6" INTERNAL_CLIENT_ID: "" - # OAuth2 token endpoint for service-to-service authentication. 
- # Example: "https://dev-123456.okta.com/oauth2/default/v1/token" + # OAuth2 token endpoint URL for service-to-service authentication. + # Used with INTERNAL_CLIENT_ID for client_credentials token acquisition. + # Okta example: "https://dev-123456.okta.com/oauth2/default/v1/token" + # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/oauth2/v2.0/token" AUTH_TOKEN_URL: "" + # --- OIDC / OAuth2 Authentication --- + # Auth configuration is in flyte.configmap.adminServer.auth (see below). + # Set auth fields directly in that block or via your values overlay. + # The globals below are kept for backward compatibility only. + + # Deprecated: set flyte.configmap.adminServer.auth.appAuth.externalAuthServer.baseUrl instead. + OIDC_BASE_URL: "" + # Deprecated: set flyte.configmap.adminServer.auth.userAuth.openId.clientId instead. + OIDC_CLIENT_ID: "" + # Deprecated: set flyte.configmap.adminServer.auth.appAuth.thirdPartyConfig.flyteClient.clientId instead. + CLI_CLIENT_ID: "" + + # OAuth2 scope for service-to-service authentication (client_credentials grant). + # Used by configMap.union.auth and executions.adminClient.connection (S2S concern). + # Okta: leave empty (defaults to "all"). Entra ID: "api://my-app-name/.default" + OIDC_S2S_SCOPE: "" + # ---------------------------------------------------------------------------- # Additional Configuration # ---------------------------------------------------------------------------- @@ -248,6 +270,31 @@ ingress: nginx.ingress.kubernetes.io/auth-url: "http://flyteadmin.{{ template \"flyte.namespace\" . }}.svc.cluster.local/me" nginx.ingress.kubernetes.io/auth-response-headers: "Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token" nginx.ingress.kubernetes.io/auth-cache-key: "$http_authorization$http_flyte_authorization$http_cookie" + # For gRPC backends (backend-protocol: GRPC), nginx uses grpc_pass instead + # of proxy_pass. 
The auth-response-headers annotation only sets proxy headers, + # not gRPC headers. This configuration-snippet bridges identity headers from + # the auth subrequest response into the upstream gRPC request so backend + # services receive the caller's identity. + nginx.ingress.kubernetes.io/configuration-snippet: | + auth_request_set $user_id $upstream_http_x_user_subject; + proxy_set_header X-User-Subject $user_id; + grpc_set_header X-User-Subject $user_id; + + auth_request_set $user_identitytype $upstream_http_x_user_claim_identitytype; + proxy_set_header X-User-Claim-Identitytype $user_identitytype; + grpc_set_header X-User-Claim-Identitytype $user_identitytype; + + auth_request_set $user_handle $upstream_http_x_user_claim_userhandle; + proxy_set_header X-User-Claim-userhandle $user_handle; + grpc_set_header X-User-Claim-userhandle $user_handle; + + auth_request_set $groups $upstream_http_x_user_claim_groups; + proxy_set_header X-User-Claim-groups $groups; + grpc_set_header X-User-Claim-groups $groups; + + more_set_headers "x-request-id: $request_id"; + proxy_set_header x-request-id $request_id; + grpc_set_header x-request-id $request_id; envoyGateway: # GatewayClass name for Envoy Gateway. Used when INGRESS_PROVIDER is "envoy" or "both". @@ -256,7 +303,7 @@ envoyGateway: enabled: false requestsPerUnit: 100 unit: Second - + # -- Central logging configuration. All controlplane services pull their log level from here. # Go services use level 1–6 (1=least verbose, 6=most verbose; 4=INFO, 6=DEBUG). # Log format options: json, text, gcp @@ -309,6 +356,14 @@ configMap: legacyHosts: - '{{ .Values.global.UNION_ORG }}' union: + connection: + # Overridden by terraform from authn module trusted_identity_claims output. 
+ # Okta: externalIdentityClaim = internal client_id (sub == client_id) + # Entra ID: externalIdentityClaim = Service Principal Object ID + trustedIdentityClaims: + enabled: true + externalIdentityClaim: "" + externalIdentityTypeClaim: "app" internalConnectionConfig: enabled: true urlPattern: "_SERVICE_.{{ .Release.Namespace }}.svc.cluster.local:80" @@ -320,7 +375,7 @@ configMap: tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' authorizationMetadataKey: flyte-authorization scopes: - - all + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' services: artifacts: @@ -412,20 +467,9 @@ services: # # Supported types: # - "Noop" — no enforcement (default) - # - "UserClouds" — Union Cloud's authorization backend + # - "UserClouds" — Union RBAC (just set type, defaults are pre-configured) # - "External" — customer-provided gRPC authorization server (selfhosted) # - # --- Union Cloud (UserClouds) --- - # For Union Cloud deployments, set type to "UserClouds": - # authorizer: - # type: "UserClouds" - # userCloudsClient: - # tenantUrl: 'http://{{ .Release.Name }}-union-authz.{{ .Release.Namespace }}.svc.cluster.local:8080' - # tenantID: '623771e7-ddd6-4575-bedb-7c970ec75b87' - # clientID: '{{ .Values.union.authz.clientID }}' - # clientSecretName: 'union/client_secret' - # enableLogging: true - # # --- External Authorization (selfhosted) --- # For selfhosted deployments with a customer-provided authz server: # authorizer: @@ -467,6 +511,31 @@ services: forwardHeaders: - authorization - flyte-authorization + # --- UserClouds client defaults (pre-configured) --- + # These defaults are used when type is set to "UserClouds" (Union RBAC). + # They are ignored when type is "Noop" or "External". + # To enable Union RBAC, just change type to "UserClouds" — no other + # configuration is needed. Override individual fields only if your + # deployment uses non-standard naming or secrets. 
+ userCloudsClient: + tenantUrl: 'http://{{ .Release.Name }}-union-authz.{{ .Release.Namespace }}.svc.cluster.local:8080' + tenantID: '623771e7-ddd6-4575-bedb-7c970ec75b87' + clientID: '{{ .Values.union.authz.clientID }}' + clientSecretName: 'union/client_secret' + enableLogging: true + internalCommunicationConfig: + enabled: false + bootstrap: + organization: "" + domains: + - development + - staging + - production + projects: [] + serviceAccounts: [] + adminUsers: [] + retryInterval: 5s + maxRetries: 30 sharedService: connectPort: 8081 metrics: @@ -522,6 +591,38 @@ services: secureTunnelTenantURLPattern: http://ingress-nginx-internal.ingress-nginx.svc.cluster.local:80 # http://ingress-nginx-internal.ingress-nginx.svc.cluster.local clusterSelector: type: local + organizations: + fullnameOverride: "organizations" + sharedService: + connectPort: 8081 + initContainers: + - name: migrate + args: + - cloudorganizations + - migrate + - --config + - "/etc/config/*.yaml" + args: + - cloudorganizations + - serve + - --config + - /etc/config/*.yaml + configMap: + sharedService: + connectPort: 8081 + metrics: + scope: "organizations:" + db: + dbname: '{{ .Values.global.DB_NAME }}' + host: '{{ .Values.global.DB_HOST }}' + username: '{{ .Values.global.DB_USER }}' + passwordPath: /etc/db/pass.txt + port: 5432 + connectionPool: + maxIdleConnections: 20 + maxOpenConnections: 20 + maxConnectionLifetime: 1m + executions: fullnameOverride: "executions" initContainers: @@ -560,18 +661,19 @@ services: eventsProxy: recorderType: RunService executions: - - # app: - # adminClient: - # connection: - # -- Override rootTenantURLPattern for adminClient to point to control plane service. - # endpoint: "" - - # -- Insecure should be true only for local testing with self-signed certs. - # insecure: true|false - - # -- Skip TLS verification for self-signed certs. Should be true only for local testing. 
- # insecureSkipVerify: true|false + app: + adminClient: + connection: + # TODO(FAB-195): Replace FLYTEADMIN_ENDPOINT with CONTROLPLANE_INTRA_CLUSTER_HOST + # so all S2S traffic routes through ingress (auth subrequests, TLS, etc.). + endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}' + insecure: true + authorizationHeader: "flyte-authorization" + clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' + clientSecretLocation: "/etc/secrets/union/client_secret" + tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' + scopes: + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' apps: # Enrich "Owned By" display with name/email from a remote identity service. @@ -1193,24 +1295,90 @@ flyte: # template: "{{ project }}-{{ domain }}" adminServer: - # --- Subject Claim Resolution (FAB-205) --- - # When flyteadmin validates a JWT access token, it resolves the caller's subject - # from JWT claims. By default it reads the standard "sub" claim. Some IdPs omit - # "sub" from client credentials tokens or use a different claim for the client - # identity (e.g. "client_id"). - # - # When subjectClaimNames is configured under auth.appAuth, it becomes the - # authoritative ordered list of claims to try. The first non-empty value wins - # and is used as the subject for all downstream identity resolution (including - # /me, authorization middleware, and internal service-to-service calls). - # - # When not configured, the standard "sub" claim is used (default behavior). + server: + security: + useAuth: true + + # --- OIDC Authentication Configuration --- + # Configure your identity provider below. Set values directly or via + # your values overlay. For Union-managed deployments, the authn Terraform + # module generates this block automatically. # - # auth: - # appAuth: - # subjectClaimNames: - # - sub - # - client_id # Common alternative for client credentials tokens + # Supported IdPs: Okta, Entra ID (Azure AD), Keycloak, Authentik. + # See unionai-docs for provider-specific setup guides. 
+ auth: + httpAuthorizationHeader: "flyte-authorization" + grpcAuthorizationHeader: "flyte-authorization" + authorizedUris: + - "http://flyteadmin:80" + - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' + - 'https://{{ .Values.global.UNION_HOST }}' + + appAuth: + authServerType: "External" + externalAuthServer: + # --- OIDC Issuer --- + # Okta example: "https://dev-123456.okta.com/oauth2/default" + # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" + baseUrl: "" + # Metadata discovery endpoint (relative to baseUrl). + # Okta: ".well-known/oauth-authorization-server" (default) + # Entra ID: ".well-known/openid-configuration" + metadataUrl: ".well-known/oauth-authorization-server" + # Allowed JWT audiences for access token validation. + # Default: ["https://{UNION_HOST}"]. + # Entra ID example: ["api://my-app-name", "f0b2667d-..."] + allowedAudience: + - 'https://{{ .Values.global.UNION_HOST }}' + + # --- Subject claim resolution (FAB-205) --- + # Ordered list of JWT claims to try for caller identity. + # Only needed for IdPs where client_credentials tokens omit "sub". + # Default: uses standard "sub" claim. + # Override in values overlay if your IdP requires fallback claims. + + # --- Identity type claim mapping --- + # Maps IdP-specific claims to internal identitytype. + # Okta: not needed (identitytype claim is native) + # Entra ID example (set in values overlay): + # identityTypeClaimsForApps: + # idtyp: ["app"] + + thirdPartyConfig: + flyteClient: + # --- CLI/SDK PKCE Client --- + # Okta example: "0oa7mno8pqr9stu0v1w2" + # Entra ID example: "3df10225-18a5-4636-b1ef-582e5a8ea21c" + clientId: "" + redirectUri: "http://localhost:53593/callback" + # Resource scope for CLI/SDK and task pod authentication. + # Okta: ["all"] (default) + # Entra ID: ["api://my-app-name/.default"] + scopes: + - "all" + # Audience parameter for authorization requests. 
+ # Okta: "" (not needed) + # Entra ID: "api://my-app-name" + audience: "" + + userAuth: + openId: + # --- Browser Login --- + # Okta example: "0oa1abc2def3ghi4j5k6" + # Entra ID example: "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12" + baseUrl: "" + clientId: "" + # Browser login scopes. + # Okta: ["profile", "openid", "offline_access"] (default) + # Entra ID: ["profile", "openid", "offline_access", "api://my-app/all"] + scopes: + - profile + - openid + - offline_access + cookieSetting: + sameSitePolicy: "LaxMode" + domain: "" + idpQueryParameter: "idp" admin: endpoint: 'dns:///{{ .Values.global.UNION_HOST }}' @@ -1253,7 +1421,7 @@ flyte: populateUserFields: false server: security: - useAuth: false + useAuth: true union: internalConnectionConfig: enabled: true diff --git a/charts/dataplane/values.aws.selfhosted-intracluster.yaml b/charts/dataplane/values.aws.selfhosted-intracluster.yaml index b2a6654d..58c62dad 100644 --- a/charts/dataplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/dataplane/values.aws.selfhosted-intracluster.yaml @@ -109,9 +109,14 @@ global: # Required when the control plane has OIDC enabled. # Supports any OAuth2/OIDC provider (Okta, Azure AD, Auth0, Keycloak, etc.) # - # Service-to-service OAuth client ID (client_credentials flow) + # Service-to-service OAuth client ID (client_credentials flow). + # Uses: OAuth App 4 (Operator). # Example: "0oa3xyz4abc5def6g7h8" AUTH_CLIENT_ID: "" + # OAuth2 scope for service-to-service client_credentials grant. + # Okta: leave empty (defaults to "all"). + # Entra ID: "api://my-app-name/.default" + OIDC_S2S_SCOPE: "" # ---------------------------------------------------------------------------- # SECTION 2: Core Identity Configuration (REQUIRED) @@ -181,6 +186,9 @@ clusterresourcesync: # --- Service-to-service OAuth2 --- # ClusterResourceSync acquires OAuth2 tokens via client_credentials # flow and sends them on outgoing calls to the control plane. 
+ # OAuth2 auth for dataplane → controlplane service-to-service calls. + # Uses: OAuth App 4 (Operator — confidential client, client_credentials grant). + # Flow: Service-to-service (Flow 3). auth: enable: true type: "ClientSecret" @@ -188,6 +196,8 @@ clusterresourcesync: clientSecretLocation: "/etc/union/secret/client_secret" authorizationMetadataKey: "flyte-authorization" tokenRefreshWindow: "5m" + scopes: + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' # ---------------------------------------------------------------------------- # Core Service Configuration diff --git a/tests/generated/controlplane.aws.billing-enable.yaml b/tests/generated/controlplane.aws.billing-enable.yaml index 8b3d730d..d2f2cd8e 100644 --- a/tests/generated/controlplane.aws.billing-enable.yaml +++ b/tests/generated/controlplane.aws.billing-enable.yaml @@ -682,12 +682,31 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] externalClient: forwardHeaders: - authorization - flyte-authorization + internalCommunicationConfig: + enabled: false type: Noop useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false @@ -3576,12 +3595,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) 
(rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3858,17 +3877,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -3905,17 +3924,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": 
"C" } @@ -3966,12 +3985,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4008,17 +4027,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4055,7 +4074,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + 
rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4084,10 +4103,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4643,17 +4666,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } diff --git a/tests/generated/controlplane.aws.yaml b/tests/generated/controlplane.aws.yaml index 54d471d1..98396771 100644 --- a/tests/generated/controlplane.aws.yaml +++ b/tests/generated/controlplane.aws.yaml @@ -682,12 +682,31 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + 
retryInterval: 5s + serviceAccounts: [] externalClient: forwardHeaders: - authorization - flyte-authorization + internalCommunicationConfig: + enabled: false type: Noop useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false @@ -3576,12 +3595,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3858,17 +3877,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -3905,17 +3924,17 @@ data: "type": 
"timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": "C" } @@ -3966,12 +3985,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4008,17 +4027,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", 
quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4055,7 +4074,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4084,10 +4103,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4643,17 +4666,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": 
"usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } diff --git a/tests/generated/controlplane.custom-oidc.yaml b/tests/generated/controlplane.custom-oidc.yaml new file mode 100644 index 00000000..0ef0c61f --- /dev/null +++ b/tests/generated/controlplane.custom-oidc.yaml @@ -0,0 +1,10458 @@ +--- +# Source: controlplane/templates/scylla/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: scylla-operator +--- +# Source: controlplane/charts/scylla-operator/templates/operator.pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: scylla-operator + namespace: scylla-operator +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: webhook-server + namespace: scylla-operator +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +--- +# Source: controlplane/templates/console/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name +--- +# Source: 
controlplane/templates/flyte-core-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: flyteadmin + namespace: union +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyteadmin +--- +# Source: controlplane/templates/flyte-core-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: datacatalog + namespace: union +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: datacatalog +--- +# Source: controlplane/templates/flyte-core-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cacheservice + namespace: union +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: cacheservice +--- +# Source: controlplane/templates/pdb.yaml +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: authorizer +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cluster +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: dataproxy +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: executions +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: queue +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: queue + 
app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: run-scheduler +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: usage +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/flyte/templates/admin/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm + annotations: + eks.amazonaws.com/role-arn: '' +imagePullSecrets: + - name: union-registry-secret +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx + namespace: union +automountServiceAccountToken: true +--- +# Source: controlplane/charts/scylla-operator/templates/operator.serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: scylla-operator + namespace: scylla-operator + labels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + namespace: scylla-operator + name: webhook-server + labels: + 
app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +--- +# Source: controlplane/templates/cacheservice/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cacheservice + namespace: union + labels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/managed-by: Helm + annotations: + eks.amazonaws.com/role-arn: '' +imagePullSecrets: + - name: union-registry-secret +--- +# Source: controlplane/templates/console/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: authorizer + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cluster + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: dataproxy + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: executions + labels: + helm.sh/chart: 
controlplane-2026.4.7 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: queue + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: run-scheduler + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: usage + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/union-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: union + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/charts/flyte/templates/admin/secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: flyte-admin-secrets + namespace: union +type: Opaque +stringData: +--- +# Source: controlplane/charts/flyte/templates/common/secret-auth.yaml +apiVersion: v1 +kind: Secret +metadata: + name: flyte-secret-auth + namespace: union +type: Opaque +stringData: + client_secret: placeholder +--- +# Source: controlplane/charts/flyte/templates/admin/configmap.yaml 
+apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-admin-clusters-config + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +data: + clusters.yaml: | + clusters: + clusterConfigs: [] + labelClusterMap: {} +--- +# Source: controlplane/charts/flyte/templates/admin/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-admin-base-config + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +data: + db.yaml: | + database: + connMaxLifeTime: 120s + dbname: flyteadmin + host: '' + maxIdleConnections: 10 + maxOpenConnections: 80 + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + domain.yaml: | + domains: + - id: development + name: development + - id: staging + name: staging + - id: production + name: production + otel.yaml: | + otel: + file: + filename: /tmp/trace.txt + jaeger: + endpoint: http://localhost:14268/api/traces + otlpgrpc: + endpoint: http://localhost:4317 + otlphttp: + endpoint: http://localhost:4318/v1/traces + sampler: + parentSampler: always + type: noop + server.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/client_secret + endpoint: dns:/// + insecure: true + auth: + appAuth: + authServerType: External + externalAuthServer: + baseUrl: 'https://idp.example.com/oauth2/v2.0' + metadataUrl: '.well-known/openid-configuration' + identityTypeClaimsForApps: + idtyp: + - app + thirdPartyConfig: + flyteClient: + audience: 'api://my-app' + clientId: '55555555-6666-7777-8888-999999999999' + redirectUri: http://localhost:53593/callback + scopes: + - 'api://my-app/all' + authorizedUris: + - http://flyteadmin:80 + - http://flyteadmin.union.svc.cluster.local:80 + grpcAuthorizationHeader: flyte-authorization + httpAuthorizationHeader: 
flyte-authorization + userAuth: + cookieSetting: + domain: "" + sameSitePolicy: LaxMode + idpQueryParameter: idp + openId: + baseUrl: 'https://idp.example.com/oauth2/v2.0' + clientId: '00000000-1111-2222-3333-444444444444' + scopes: + - profile + - openid + - offline_access + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cloudEvents: + enable: false + connection: + environment: staging + region: '' + rootTenantURLPattern: dns:/// + flyteadmin: + eventVersion: 2 + metadataStoragePrefix: + - metadata + - admin + metricsKeys: + - phase + metricsScope: 'flyte:' + profilerPort: 10254 + roleNameKey: iam.amazonaws.com/role + useOffloadedInputs: true + useOffloadedWorkflowClosure: true + otel: + type: noop + private: + app: + cacheProviderConfig: + kind: bypass + populateUserFields: false + server: + grpc: + port: 8089 + httpPort: 8088 + security: + allowCors: true + allowedHeaders: + - Content-Type + - flyte-authorization + allowedOrigins: + - '*' + secure: false + useAuth: false + sharedService: + connectPort: 8089 + httpPort: 8088 + port: 8089 + security: + singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + internalConnectionConfig: + enabled: true + urlPattern: '_SERVICE_.union.svc.cluster.local:80' + remoteData.yaml: | + remoteData: + region: us-east-1 + scheme: local + signedUrls: + durationMinutes: 3 + storage.yaml: | + storage: + type: s3 + container: "" + connection: + auth-type: iam + region: + enable-multicontainer: false + limits: + maxDownloadMBs: 10 + cache: + max_size_mbs: 1024 + target_gc_percent: 70 + task_resource_defaults.yaml: | + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 2 + gpu: 1 + memory: 1Gi +--- +# Source: controlplane/charts/flyte/templates/console/configmap.yaml +apiVersion: v1 +kind: ConfigMap 
+metadata: + name: flyte-console-config + namespace: union + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm +data: + BASE_URL: /console + CONFIG_DIR: /etc/flyte/config +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller + namespace: union +data: + allow-snippet-annotations: "true" + annotations-risk-level: "Critical" + grpc-connect-timeout: "1200" + grpc-read-timeout: "604800" + grpc-send-timeout: "604800" + proxy-connect-timeout: "60" + proxy-read-timeout: "3600" + proxy-send-timeout: "3600" +--- +# Source: controlplane/templates/cacheservice/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: cacheservice-config + namespace: union + labels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/managed-by: Helm +data: + db.yaml: | + database: + connMaxLifeTime: 120s + dbname: cacheservice + host: '' + maxIdleConnections: 10 + maxOpenConnections: 20 + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + logger.yaml: | + formatter: + type: json + level: 6 + show-source: true + server.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache-server: + grpcPort: 8089 + grpcServerReflection: true + httpPort: 8080 + cacheservice: + 
heartbeat-grace-period-multiplier: 3 + max-reservation-heartbeat: 30s + metrics-scope: flyte + profiler-port: 10254 + storage-prefix: cached_outputs + otel: + type: noop + private: + app: + cacheProviderConfig: + kind: bypass + union: + internalConnectionConfig: + enabled: true + urlPattern: '_SERVICE_.union.svc.cluster.local:80' + storage.yaml: | + storage: + type: s3 + container: "" + connection: + auth-type: iam + region: + enable-multicontainer: false + limits: + maxDownloadMBs: 10 + cache: + max_size_mbs: 1024 + target_gc_percent: 70 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: authorizer + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] + externalClient: + forwardHeaders: + - authorization + - flyte-authorization + internalCommunicationConfig: + enabled: false + type: Noop + useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + logger: + formatter: + type: json + 
level: 6 + show-source: true + otel: + type: noop + sharedService: + connectPort: 8081 + metrics: + scope: 'authorizer:' + security: + singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: cluster + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + cloudProvider: + provider: Mock + cluster: + cloudflare: + active: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + db: + connectionPool: + maxConnectionLifetime: 1m + maxIdleConnections: 20 + maxOpenConnections: 20 + dbname: '' + host: '' + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + connectPort: 8081 + metrics: + scope: 'cluster:' + security: + singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + 
auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: dataproxy + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + dataproxy: + clusterSelector: + type: local + secureTunnelTenantURLPattern: '' + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + metrics: + scope: 'dataproxy:' + security: + singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true + internalConnectionConfig: + enabled: true + urlPattern: 
_SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: executions + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + cloudEventsProcessor: + cloudProvider: Local + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + db: + connectionPool: + maxConnectionLifetime: 1m + maxIdleConnections: 20 + maxOpenConnections: 20 + dbname: '' + host: '' + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + eventsProxy: + recorderType: RunService + executions: + app: + adminClient: + connection: + authorizationHeader: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + scopes: + - 'all' + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + apps: + enrichIdentities: false + publicURLPattern: https://%s.apps. 
+ llm: + enabled: false + task: + clusterCacheConfig: + ttl: 10m + enabled: true + enrichIdentities: false + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + metrics: + scope: 'executions:' + security: + singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 + workspace: + enable: false +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: queue + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + queue: + db: + hosts: + - 'scylla-client.union.svc.cluster.local' + threadCount: 64 + type: cql + eventer: + recordActionThreadCount: 16 + type: runservice + updateActionStatusThreadCount: 16 + sharedService: + metrics: + scope: 'queue:' + security: + 
singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: run-scheduler + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + db: + connectionPool: + maxConnectionLifetime: 1m + maxIdleConnections: 20 + maxOpenConnections: 20 + dbname: '' + host: '' + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + metrics: + scope: 'run-scheduler:' + security: + singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 
'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: usage + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + billing: + enable: false + cache: + identity: + enabled: false + cloudProvider: + provider: Mock + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + connectPort: 8081 + metrics: + scope: 'usage:' + security: + singleTenantOrgID: '' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 + usage: + taskMetrics: + agentQuery: + mappings: + dgx_job: + queries: + EXECUTION_METRIC_ALLOCATED_CPU_AVG: CPU_ALLOCATION:MEAN + EXECUTION_METRIC_ALLOCATED_MEMORY_BYTES_AVG: MEM_ALLOCATION:MEAN + 
EXECUTION_METRIC_CPU_UTILIZATION: CPU_UTILIZATION:MEAN + EXECUTION_METRIC_GPU_UTILIZATION: GPU_UTILIZATION:MEAN + EXECUTION_METRIC_MEMORY_UTILIZATION: MEM_UTILIZATION:MEAN + metricDelayToleranceDuration: 0s + promQuery: + queries: + EXECUTION_METRIC_ALLOCATED_CPU_AVG: | + max by (namespace, pod) ( + ( + sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}[5m])) > + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"}) + ) + or + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"}) + ) * + on (namespace, pod) group_left max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_ALLOCATED_MEMORY_BYTES_AVG: | + max by (namespace, pod) ( + ( + sum by (namespace, pod) (container_memory_working_set_bytes{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}) > + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"}) + ) + or + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"}) + ) * + on (namespace, pod) group_left max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_APP_REPLICA_COUNT: | + sum (kube_pod_status_phase{phase=~"Running|Pending", namespace="{{.Namespace}}", pod=~"{{.AppName}}.*"} == 1) or vector(0) + EXECUTION_METRIC_APP_REQUESTS: | + sum(rate(( + envoy_cluster_upstream_rq_xx{ + job="serving-envoy", + project=~"{{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, envoy_response_code_class) + EXECUTION_METRIC_APP_RESPONSE_TIME_P50: | + histogram_quantile(0.5, 
sum(rate(( + envoy_cluster_upstream_rq_time_bucket{ + job="serving-envoy", + project=~"${{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, le)) + EXECUTION_METRIC_APP_RESPONSE_TIME_P90: | + histogram_quantile(0.90, sum(rate(( + envoy_cluster_upstream_rq_time_bucket{ + job="serving-envoy", + project=~"${{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, le)) + EXECUTION_METRIC_APP_RESPONSE_TIME_P95: | + histogram_quantile(0.95, sum(rate(( + envoy_cluster_upstream_rq_time_bucket{ + job="serving-envoy", + project=~"${{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, le)) + EXECUTION_METRIC_CPU_UTILIZATION: | + (sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}[5m])) / + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"})) * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_GPU_FRAME_BUFFER_UTILIZATION: | + (sum by (namespace, pod, gpu) (DCGM_FI_DEV_FB_USED{namespace="{{.Namespace}}",pod=~"{{.PodName}}"}) / + sum by (namespace, pod, gpu) (DCGM_FI_DEV_FB_USED{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} + DCGM_FI_DEV_FB_FREE{namespace="{{.Namespace}}",pod=~"{{.PodName}}"})) * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_GPU_MEMORY_UTILIZATION: | + sum by (gpu) (DCGM_FI_DEV_MEM_COPY_UTIL{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() max by (namespace, pod) 
(kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) / 100.0 + EXECUTION_METRIC_GPU_SM_ACTIVE: | + sum by (gpu) (DCGM_FI_PROF_SM_ACTIVE{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_GPU_SM_OCCUPANCY: | + sum by (gpu) (DCGM_FI_PROF_SM_OCCUPANCY{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_GPU_UTILIZATION: | + sum by (gpu) (DCGM_FI_DEV_GPU_UTIL{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) / 100.0 + EXECUTION_METRIC_LIMIT_CPU: | + sum by (namespace, pod) (kube_pod_container_resource_limits{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_LIMIT_MEMORY_BYTES: | + sum by (namespace, pod) (kube_pod_container_resource_limits{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_MEMORY_UTILIZATION: | + (sum by (namespace, pod) (container_memory_working_set_bytes{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}) / + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"})) * + on (namespace, pod) group_left() max by (namespace, pod) 
(kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_REQUEST_CPU: | + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_REQUEST_MEMORY_BYTES: | + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_USED_CPU_AVG: | + sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}[5m]) * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_USED_MEMORY_BYTES_AVG: | + sum by (namespace, pod) (container_memory_working_set_bytes{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + workers: 10 +--- +# Source: controlplane/templates/monitoring/dashboard-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-dashboard-union-controlplane-overview + namespace: union + labels: + grafana_dashboard: "1" + app.kubernetes.io/managed-by: Helm +data: + union-controlplane-overview.json: |- + { + "annotations": { + "list": [] + }, + "description": "Union Controlplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + 
"h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Connect Error Rate", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval])) / sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Error %", + "refId": "A" + } + ], + "description": "Fraction of Connect RPC responses with non-OK/non-Canceled codes across all CP services." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "none" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Connect Request Rate by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ service }}", + "refId": "A" + } + ], + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Connect Errors by Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", + "legendFormat": "{{ code }}", + "refId": "A" + } + ], + "description": "Connect error responses by gRPC status code (Internal, Unavailable, etc.)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(authorizer:handler_panic{namespace=\"$namespace\"} + cluster:handler_panic{namespace=\"$namespace\"} + dataproxy:handler_panic{namespace=\"$namespace\"} + executions:handler_panic{namespace=\"$namespace\"} + queue:handler_panic{namespace=\"$namespace\"} + usage:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics across all CP services. Any non-zero value indicates a service caught a panic during request handling." 
+ }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. <0 = budget exhausted. 
Requires monitoring.slos.enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Ingress Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:ingress_success_rate or (1 - sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"5..\"}[5m])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[5m])))", + "refId": "A" + } + ], + "description": "Ingress success rate (non-5xx). Customer-facing SLO metric. Falls back to raw metric if SLO recording rules are not enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Ingress Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:ingress_latency_p99 or histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])))", + "refId": "A" + } + ], + "description": "Ingress p99 latency. Falls back to raw metric if SLO recording rules are not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "Service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:cp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 100, + "title": "Ingress (nginx)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 101, + "title": "Request Rate by Path", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (host, path) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ host }}{{ path }}", + "refId": "A" + } + ], + "description": "Ingress request rate broken down by host and URL path." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 102, + "title": "Error Rate by Status Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (status) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"[45]..\"}[$__rate_interval]))", + "legendFormat": "{{ status }}", + "refId": "A" + } + ], + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 103, + "title": "Latency p50 / p95 / p99", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 104, + "title": "Active Connections", + "type": "timeseries", + "targets": [ + { + "expr": "sum(nginx_ingress_controller_nginx_process_connections{namespace=\"$namespace\"})", + "legendFormat": "Active", + "refId": "A" + } + ], + "description": "Current number of active client connections to ingress-nginx." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 200, + "title": "Connect / gRPC", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 201, + "title": "Connect Request Rate by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ service }}", + "refId": "A" + } + ], + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 202, + "title": "Connect Errors by Service & Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service, code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", + "legendFormat": "{{ service }} {{ code }}", + "refId": "A" + } + ], + "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 203, + "title": "gRPC Server Request Rate (CacheService)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_server_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 204, + "title": "gRPC Server Errors (CacheService)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_method, grpc_code) (rate(grpc_server_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "CacheService gRPC errors by method and code." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 300, + "title": "FlyteAdmin", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 301, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:admin:execution_manager:active_executions{namespace=\"$namespace\"}", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "flyte:admin:node_execution_manager:active_node_executions{namespace=\"$namespace\"}", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "flyte:admin:task_execution_manager:active_executions{namespace=\"$namespace\"}", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 302, + "title": "Execution Create / Event Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:admin:execution_manager:executions_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Executions created", + "refId": "A" + }, + { + "expr": "rate(flyte:admin:execution_manager:execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Workflow events", + "refId": "B" + }, + { + "expr": "rate(flyte:admin:node_execution_manager:node_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Node events", + "refId": "C" + }, + { + "expr": "rate(flyte:admin:task_execution_manager:task_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Task events", + "refId": "D" + } + ], + "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 303, + "title": "Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:admin:execution_manager:propeller_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller failures", + "refId": "A" + }, + { + "expr": "rate(flyte:admin:execution_manager:transformer_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Transformer errors", + "refId": "B" + }, + { + "expr": "rate(flyte:admin:execution_manager:publish_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Publish errors", + "refId": "C" + }, + { + "expr": "rate(flyte:admin:execution_manager:execution_termination_failure{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Termination failures", + "refId": "D" + } + ], + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 304, + "title": "Endpoint Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:admin:create_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "CreateExecution", + "refId": "A" + }, + { + "expr": "flyte:admin:create_execution_event:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "CreateExecutionEvent", + "refId": "B" + }, + { + "expr": "flyte:admin:get_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "GetExecution", + "refId": "C" + }, + { + "expr": "flyte:admin:list_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "ListExecution", + "refId": "D" + } + ], + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 305, + "title": "Auth Middleware Decisions", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:middleware:authorization:authz_approved{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Approved", + "refId": "A" + }, + { + "expr": "rate(flyte:middleware:authorization:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Denied", + "refId": "B" + } + ], + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "Executions", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 401, + "title": "Execution Create / Ack Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Create", + "refId": "A" + }, + { + "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Ack", + "refId": "B" + } + ], + "description": "Rate of execution operation creates and acknowledgements. Create = new execution request, Ack = DP confirmed receipt." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 402, + "title": "Execution Create / Ack Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Create p95", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Ack p95", + "refId": "B" + } + ], + "description": "Time to prepare create/ack execution requests at p95." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 403, + "title": "Assignment Duration (p50 / p90)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p90", + "refId": "B" + } + ], + "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 404, + "title": "Workqueue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Send ops", + "refId": "A" + }, + { + "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Claims", + "refId": "B" + }, + { + "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Send failures", + "refId": "C" + }, + { + "expr": "rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Claim failures", + "refId": "D" + } + ], + "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." 
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "lineWidth": 1,
+              "showPoints": "never"
+            },
+            "unit": "ops"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 8,
+          "y": 20
+        },
+        "id": 405,
+        "title": "DB Operation Rate",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "sum by (op) (rate(label_replace({__name__=~\"executions:database:postgres:repositories:execution_ops:.*_count\", namespace=\"$namespace\"}, \"op\", \"$1\", \"__name__\", \"executions:database:postgres:repositories:execution_ops:(.*)_count\")[$__rate_interval:]))",
+            "legendFormat": "{{ op }}",
+            "refId": "A"
+          }
+        ],
+        "description": "Execution operations DB request rate by op: create, ack, claim, unclaim, get, update."
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "lineWidth": 1,
+              "showPoints": "never"
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 16,
+          "y": 20
+        },
+        "id": 406,
+        "title": "DB Errors",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(executions:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "gorm_error",
+            "refId": "A"
+          },
+          {
+            "expr": "rate(executions:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "postgres_error",
+            "refId": "B"
+          },
+          {
+            "expr": "rate(executions:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "not_found",
+            "refId": "C"
+          }
+        ],
+        "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 407, + "title": "Cluster Cache Hit/Miss", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:executions:list_clusters:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cluster hits", + "refId": "A" + }, + { + "expr": "rate(executions:executions:list_clusters:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cluster miss", + "refId": "B" + }, + { + "expr": "rate(executions:executions:list_nodepools:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Nodepool hits", + "refId": "C" + }, + { + "expr": "rate(executions:executions:list_nodepools:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Nodepool miss", + "refId": "D" + } + ], + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 408, + "title": "Pending Assignments", + "type": "timeseries", + "targets": [ + { + "expr": "executions:app:leaser:pending_assignment_unlabeled{namespace=\"$namespace\"}", + "legendFormat": "Pending", + "refId": "A" + } + ], + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 36 + }, + "id": 409, + "title": "First Ack Latency (V2 SLI)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 36 + }, + "id": 410, + "title": "V2 Run Dispatch", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:run:runs_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs sent", + "refId": "A" + }, + { + "expr": "rate(executions:run:actions_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Actions sent", + "refId": "B" + }, + { + "expr": "rate(executions:run:enqueue_action_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Enqueue failures", + "refId": "C" + } + ], + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 36 + }, + "id": 411, + "title": "V2 Run Notifier", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:run_notifier:notifications_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Notifications/s", + "refId": "A" + }, + { + "expr": "executions:run_notifier:subscribers{namespace=\"$namespace\"}", + "legendFormat": "Subscribers", + "refId": "B" + }, + { + "expr": "rate(executions:run:logs:tail_logs_bytes_read{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Log bytes/s", + "refId": "C" + } + ], + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log 
bytes streamed."
+      }
+    ]
+  },
+  {
+    "collapsed": true,
+    "gridPos": {
+      "h": 1,
+      "w": 24,
+      "x": 0,
+      "y": 34
+    },
+    "id": 500,
+    "title": "Queue / Run-Scheduler",
+    "type": "row",
+    "panels": [
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "lineWidth": 1,
+              "showPoints": "never"
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 0,
+          "y": 13
+        },
+        "id": 501,
+        "title": "Metadata Store Counts",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "queue:metadata_store:total_run_count{namespace=\"$namespace\"}",
+            "legendFormat": "Total runs",
+            "refId": "A"
+          },
+          {
+            "expr": "queue:metadata_store:total_action_count{namespace=\"$namespace\"}",
+            "legendFormat": "Total actions",
+            "refId": "B"
+          },
+          {
+            "expr": "queue:metadata_store:scheduled_run_count{namespace=\"$namespace\"}",
+            "legendFormat": "Scheduled runs",
+            "refId": "C"
+          },
+          {
+            "expr": "queue:metadata_store:scheduled_action_count{namespace=\"$namespace\"}",
+            "legendFormat": "Scheduled actions",
+            "refId": "D"
+          }
+        ],
+        "description": "Total and scheduled run/action counts in the queue. Shows system load."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 13 + }, + "id": 502, + "title": "Scheduler / Runner / Aborter Throughput", + "type": "timeseries", + "targets": [ + { + "expr": "rate(queue:scheduler:enqueued_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Enqueued", + "refId": "A" + }, + { + "expr": "rate(queue:runner:completed_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Completed", + "refId": "B" + }, + { + "expr": "rate(queue:aborter:aborted_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Aborted", + "refId": "C" + } + ], + "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 13 + }, + "id": 503, + "title": "Queue Lengths", + "type": "timeseries", + "targets": [ + { + "expr": "queue:scheduler:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Scheduler input", + "refId": "A" + }, + { + "expr": "queue:runner:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Runner input", + "refId": "B" + }, + { + "expr": "queue:aborter:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Aborter input", + "refId": "C" + }, + { + "expr": "queue:dispatcher:chain_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Dispatcher chain", + "refId": "D" + }, + { + "expr": "queue:db:queue_length{namespace=\"$namespace\"}", + "legendFormat": "DB queue", + "refId": "E" + } + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 21 + }, + "id": 504, + "title": "Dispatcher Operation Duration (p99)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (type, le) (rate(queue:dispatcher:operation_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 21 + }, + "id": 505, + "title": "State Get/Put Duration (p99)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:get_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Get p99", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:put_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Put p99", + "refId": "B" + } + ], + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 21 + }, + "id": 506, + "title": "State Cache & Eventer", + "type": "timeseries", + "targets": [ + { + "expr": "queue:state:active_states{namespace=\"$namespace\"}", + "legendFormat": "Active states", + "refId": "A" + }, + { + "expr": "queue:state:terminal_states{namespace=\"$namespace\"}", + "legendFormat": "Terminal states", + "refId": "B" + }, + { + "expr": "rate(queue:eventer:record_action_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Eventer errors", + "refId": "C" + } + ], + "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 29 + }, + "id": 507, + "title": "Worker Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "queue:scheduler:worker_capacity{namespace=\"$namespace\"}", + "legendFormat": "{{ worker_name }}", + "refId": "A" + } + ], + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 29 + }, + "id": 508, + "title": "Dispatcher Failures by Type", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (type) (rate(queue:dispatcher:operation_failures{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." 
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "lineWidth": 1,
+              "showPoints": "never"
+            },
+            "unit": "short"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 16,
+          "y": 29
+        },
+        "id": 509,
+        "title": "DB & Client Thread Pool",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "queue:db:free_threads{namespace=\"$namespace\"}",
+            "legendFormat": "DB free threads",
+            "refId": "A"
+          },
+          {
+            "expr": "queue:queue_client:free_threads{namespace=\"$namespace\"}",
+            "legendFormat": "Queue client free",
+            "refId": "B"
+          },
+          {
+            "expr": "queue:state_client:free_threads{namespace=\"$namespace\"}",
+            "legendFormat": "State client free",
+            "refId": "C"
+          }
+        ],
+        "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy."
+      }
+    ]
+  },
+  {
+    "collapsed": true,
+    "gridPos": {
+      "h": 1,
+      "w": 24,
+      "x": 0,
+      "y": 35
+    },
+    "id": 600,
+    "title": "Cluster Service",
+    "type": "row",
+    "panels": [
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "lineWidth": 1,
+              "showPoints": "never"
+            },
+            "unit": "ops"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 0,
+          "y": 14
+        },
+        "id": 601,
+        "title": "UpdateStatus / Heartbeat Rate",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(cluster:svc:update_status:updates_total{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "UpdateStatus",
+            "refId": "A"
+          },
+          {
+            "expr": "rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "Heartbeat",
+            "refId": "B"
+          }
+        ],
+        "description": "Rate of DP cluster status updates and heartbeats received by the cluster service."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 14 + }, + "id": 602, + "title": "Cluster API Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "UpdateStatus p95", + "refId": "A" + }, + { + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "Heartbeat p95", + "refId": "B" + } + ], + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 14 + }, + "id": 603, + "title": "Operator / Propeller Restarts (from DP)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:svc:update_status:operator_restarts{namespace=\"$namespace\"}", + "legendFormat": "Operator restarts", + "refId": "A" + }, + { + "expr": "cluster:svc:update_status:propeller_restarts{namespace=\"$namespace\"}", + "legendFormat": "Propeller restarts", + "refId": "B" + } + ], + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 604, + "title": "DB Errors by Type", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "gorm_error", + "refId": "A" + }, + { + "expr": "rate(cluster:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "postgres_error", + "refId": "B" + }, + { + "expr": "rate(cluster:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "not_found", + "refId": "C" + } + ], + "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "Healthy", + "color": "green" + }, + "1": { + "text": "Unhealthy", + "color": "red" + } + } + } + ] + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 605, + "title": "Cluster Health Status", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:cluster_sync:health:unhealthy{namespace=\"$namespace\", subsystem=\"\"}", + "legendFormat": "{{ org }}/{{ cluster_name }}", + "refId": "A" + } + ], + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 606, + "title": "Last Heartbeat Age (stale cluster detection)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:cluster_sync:health:last_update_age{namespace=\"$namespace\"}", + "legendFormat": "{{ org }}/{{ cluster_name }}", + "refId": "A" + } + ], + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 607, + "title": "Managed Cluster Cache", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:managed_cluster_client_cache:get:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cache hits", + "refId": "A" + }, + { + "expr": "rate(cluster:managed_cluster_client_cache:get:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cache miss", + "refId": "B" + } + ], + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." 
+      }
+    ]
+  },
+  {
+    "collapsed": true,
+    "gridPos": {
+      "h": 1,
+      "w": 24,
+      "x": 0,
+      "y": 36
+    },
+    "id": 900,
+    "title": "CacheService",
+    "type": "row",
+    "panels": [
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "lineWidth": 1,
+              "showPoints": "never"
+            },
+            "unit": "ops"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 15
+        },
+        "id": 901,
+        "title": "Cache Hit / Miss Rate",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "Hits",
+            "refId": "A"
+          },
+          {
+            "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "Misses",
+            "refId": "B"
+          },
+          {
+            "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "Get failures",
+            "refId": "C"
+          }
+        ],
+        "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 902, + "title": "Reservation Contention & Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Contention", + "refId": "A" + }, + { + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reservation acquired", + "refId": "B" + }, + { + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reservation released", + "refId": "C" + } + ], + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. 
[Metrics pending: requires cloud service instrumentation to be deployed]"
+      }
+    ]
+  },
+  {
+    "collapsed": true,
+    "gridPos": {
+      "h": 1,
+      "w": 24,
+      "x": 0,
+      "y": 36
+    },
+    "id": 750,
+    "title": "Authorizer",
+    "type": "row",
+    "panels": [
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "lineWidth": 1,
+              "showPoints": "never"
+            },
+            "unit": "ops"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 0,
+          "y": 15
+        },
+        "id": 751,
+        "title": "Allow / Deny Rate",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "Allowed",
+            "refId": "A"
+          },
+          {
+            "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])",
+            "legendFormat": "Denied",
+            "refId": "B"
+          }
+        ],
+        "description": "Authorization decision rate. Allow/deny ratio indicates auth health. High deny rate may signal misconfigured policies. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 15 + }, + "id": 752, + "title": "Authorize Latency", + "type": "timeseries", + "targets": [ + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "End-to-end Authorize() latency including identity resolution and backend authorization check. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 15 + }, + "id": 753, + "title": "Deny Rate (%)", + "type": "timeseries", + "targets": [ + { + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Deny %", + "refId": "A" + } + ], + "description": "Percentage of authorization decisions that denied access. Spikes indicate policy changes or auth issues. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } + } + } + ] + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 23 + }, + "id": 760, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^type$/" + }, + "textMode": "value" + }, + "title": "Authorizer Mode", + "type": "stat", + "targets": [ + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Currently active authorizer backend type (Noop, UserClouds, External, Authorizer)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 4, + "y": 23 + }, + "id": 761, + "title": "External Backend Latency", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Latency of calls to the external authorization backend (p50/p95/p99)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 23 + }, + "id": 762, + "title": "External Errors by gRPC Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_code) (rate(authorizer:authorizer:cloudauthorizer:connect:external:errors{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_code }}", + "refId": "A" + } + ], + "description": "Error rate from the external authorization backend, broken down by gRPC status code." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 23 + }, + "id": 763, + "title": "Fail-Open Activations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Fail-Open", + "refId": "A" + } + ], + "description": "Rate of fail-open activations. Non-zero means the external backend is unreachable and requests are being allowed without authorization." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 764, + "title": "Decisions by Action", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "allowed: {{ action }}", + "refId": "A" + }, + { + "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "denied: {{ action }}", + "refId": "B" + } + ], + "description": "Authorization decisions broken down by action (e.g. read, write, execute). Stacked to show total volume." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 765, + "title": "Error Attribution", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ error_source }}", + "refId": "A" + } + ], + "description": "Authorization errors attributed by source (e.g. identity resolution, backend, policy evaluation)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 700, + "title": "Data Proxy", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 701, + "title": "Cache Hit/Miss Rates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(dataproxy:domains:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Domain hits", + "refId": "A" + }, + { + "expr": "rate(dataproxy:domains:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Domain miss", + "refId": "B" + }, + { + "expr": "rate(dataproxy:clusterpoolcache:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "ClusterPool hits", + "refId": "C" + }, + { + "expr": "rate(dataproxy:clusterpoolcache:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "ClusterPool miss", + "refId": "D" + } + ], + 
"description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 702, + "title": "Image Read Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "legendFormat": "Success p95", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:failure_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "legendFormat": "Failure p95", + "refId": "B" + } + ], + "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 703, + "title": "Secret Proxy Errors by Cluster", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (cluster, operation) (rate(dataproxy:secrets_service:cluster_errors{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ cluster }} {{ operation }}", + "refId": "A" + } + ], + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 800, + "title": "Usage Service", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 801, + "title": "Billable Usage Reports", + "type": "timeseries", + "targets": [ + { + "expr": "rate(usage:svc:report_billable_usage{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reports/s", + "refId": "A" + } + ], + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 802, + "title": "Message Pipeline", + "type": "timeseries", + "targets": [ + { + "expr": "rate(usage:messages:messages_received{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Received", + "refId": "A" + }, + { + "expr": "rate(usage:messages:messages_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sent", + "refId": "B" + }, + { + "expr": "rate(usage:messages:messages_dropped{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Dropped", + "refId": "C" + }, + { + "expr": "rate(usage:messages:messages_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "D" + } + ], + 
"description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 803, + "title": "Messages by Type (success)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (message_type) (rate(usage:messages:messages_processed{namespace=\"$namespace\", outcome=\"success\"}[$__rate_interval]))", + "legendFormat": "{{ message_type }}", + "refId": "A" + } + ], + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 804, + "title": "Message Processing Latency", + "type": "timeseries", + "targets": [ + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Time to process individual queue messages. 
Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "controlplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "union", + "value": "union" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "union", + "value": "union" + } + ], + "query": "union", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Controlplane Overview", + "uid": "union-cp-overview", + "version": 2 + } +--- +# Source: controlplane/templates/scylla/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: scylladb +provisioner: ebs.csi.eks.amazonaws.com +volumeBindingMode: WaitForFirstConsumer +parameters: + fsType: ext4 + type: gp2 +reclaimPolicy: Delete +allowVolumeExpansion: true +--- +# Source: controlplane/charts/flyte/templates/admin/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-flyteadmin + 
labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: + - "" + - flyte.lyft.com + - rbac.authorization.k8s.io + resources: + - configmaps + - flyteworkflows + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - spark-role + - limitranges + verbs: + - '*' +--- +# Source: controlplane/charts/ingress-nginx/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + name: controlplane-nginx +rules: + - apiGroups: + - "" + resources: + - configmaps + - endpoints + - nodes + - pods + - secrets + - namespaces + verbs: + - list + - watch + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - apiGroups: + - networking.k8s.io + resources: + - ingresses/status + verbs: + - update + - apiGroups: + - networking.k8s.io + resources: + - ingressclasses + verbs: + - get + - list + - watch + - apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - list + - watch + - get +--- +# Source: controlplane/charts/scylla-operator/templates/edit_clusterrole.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: scyllacluster-edit + labels: + rbac.authorization.k8s.io/aggregate-to-admin: "true" + 
rbac.authorization.k8s.io/aggregate-to-edit: "true" +rules: +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters + - scylladbmonitorings + - scylladbdatacenters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks + verbs: + - create + - patch + - update + - delete + - deletecollection +--- +# Source: controlplane/charts/scylla-operator/templates/operator.clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:operator +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/operator.clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:aggregate-to-operator + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator: "true" +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update +- apiGroups: + - "" + resources: + - nodes + - endpoints + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - delete + - get + - list + - patch + - update + - watch + - patch +- apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create +- apiGroups: + - "" + resources: + - configmaps + - endpoints + - namespaces + - secrets + - serviceaccounts + - services + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + verbs: + - create + - get + - list + - watch + - update + - patch + - delete +- apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - update +- apiGroups: + - 
scylla.scylladb.com + resources: + - scyllaclusters + - scylladbmonitorings + - scylladbdatacenters + - remotekubernetesclusters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters/status + - scylladbmonitorings/status + - scylladbdatacenters/status + - remotekubernetesclusters/status + - scylladbclusters/status + - scylladbmanagerclusterregistrations/status + - scylladbmanagertasks/status + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - nodeconfigs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + - roles + - rolebindings + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - nodeconfigs/status + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaoperatorconfigs + - scyllaoperatorconfigs/status + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - monitoring.coreos.com + resources: + - prometheuses + - prometheusrules + - servicemonitors + verbs: + - 
get + - list + - watch + - create + - patch + - update + - delete +- apiGroups: + - "" + resources: + - configmaps/finalizers + - secrets/finalizers + - pods/finalizers + verbs: + - update +- apiGroups: + - apps + resources: + - daemonsets/finalizers + verbs: + - update +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters/finalizers + - scylladbdatacenters/finalizers + - scylladbmonitorings/finalizers + - scylladbmanagerclusterregistrations/finalizers + - scylladbmanagertasks/finalizers + verbs: + - update +- apiGroups: + - policy + resources: + - poddisruptionbudgets/finalizers + verbs: + - update +- apiGroups: + - scylla.scylladb.com + resources: + - nodeconfigs/finalizers + verbs: + - update +- apiGroups: + - "" + resources: + - configmaps/finalizers + - secrets/finalizers + - pods/finalizers + verbs: + - update +- apiGroups: + - apps + resources: + - daemonsets/finalizers + verbs: + - update +- apiGroups: + - policy + resources: + - poddisruptionbudgets/finalizers + verbs: + - update +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +--- +# Source: controlplane/charts/scylla-operator/templates/operator.clusterrole_def_openshift.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:aggregate-to-operator-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/operator_remote.clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:operator-remote +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator-remote: "true" +--- +# Source: 
controlplane/charts/scylla-operator/templates/operator_remote.clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:aggregate-to-operator-remote + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator-remote: "true" +rules: +- apiGroups: + - scylla.scylladb.com + resources: + - scylladbdatacenters + - remoteowners + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - scylladbdatacenters/status + - remoteowners/status + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - endpoints + - namespaces + - services + - secrets + - configmaps + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +--- +# Source: controlplane/charts/scylla-operator/templates/scyllacluster_member_clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scyllacluster-member +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylla-member: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/scyllacluster_member_clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scyllacluster-member + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-member: "true" +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - secrets + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - 
patch + - update + - watch +- apiGroups: + - "apps" + resources: + - statefulsets + verbs: + - get + - list + - patch + - watch +- apiGroups: + - "" + resources: + - configmaps/finalizers + - secrets/finalizers + verbs: + - update +--- +# Source: controlplane/charts/scylla-operator/templates/scyllacluster_member_clusterrole_def_openshift.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scyllacluster-member-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-member: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_grafana_clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:monitoring:grafana +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-grafana: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_grafana_clusterrole_def_openshift.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scylladb-monitoring-grafana-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-grafana: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_prometheus_clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:monitoring:prometheus +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-prometheus: "true" +--- +# Source: 
controlplane/charts/scylla-operator/templates/scylladbmonitoring_prometheus_clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scylladb-monitoring-prometheus + labels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-prometheus: "true" +rules: +- apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_prometheus_clusterrole_def_openshift.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scylladb-monitoring-prometheus-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-prometheus: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/view_clusterrole.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: scyllacluster-view + labels: + rbac.authorization.k8s.io/aggregate-to-admin: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-view: "true" +rules: +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters + - scylladbmonitorings + - scylladbdatacenters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks + verbs: + - get + - list + - watch +--- +# Source: controlplane/charts/flyte/templates/admin/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-flyteadmin-binding + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: union-flyteadmin +subjects: +- kind: ServiceAccount + name: flyteadmin + namespace: union +--- +# Source: controlplane/charts/ingress-nginx/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + name: controlplane-nginx +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: controlplane-nginx +subjects: + - kind: ServiceAccount + name: controlplane-nginx + namespace: union +--- +# Source: controlplane/charts/scylla-operator/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: scylladb:controller:operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: scylladb:controller:operator +subjects: +- kind: ServiceAccount + name: scylla-operator + namespace: scylla-operator +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx + namespace: union +rules: + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - apiGroups: + - "" + resources: + - configmaps + - pods + - secrets + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + # Omit 
Ingress status permissions if `--update-status` is disabled. + - apiGroups: + - networking.k8s.io + resources: + - ingresses/status + verbs: + - update + - apiGroups: + - networking.k8s.io + resources: + - ingressclasses + verbs: + - get + - list + - watch + - apiGroups: + - coordination.k8s.io + resources: + - leases + resourceNames: + - controlplane-nginx-leader + verbs: + - get + - update + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - list + - watch + - get +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.7 + #app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + - flyte.lyft.com + - rbac.authorization.k8s.io + resources: + - configmaps + - flyteworkflows + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - spark-role + verbs: + - '*' +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: controlplane-nginx +subjects: + - kind: ServiceAccount + name: controlplane-nginx + namespace: union +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: flyteadmin-binding + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.7 + #app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: flyteadmin +subjects: + - kind: ServiceAccount + name: flyteadmin + namespace: union +--- +# Source: controlplane/charts/flyte/templates/admin/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 8088 + - name: grpc + port: 81 + protocol: TCP + # intentionally set to TCP instead of grpc + targetPort: 8089 + - name: redoc + protocol: TCP + port: 87 + targetPort: 8087 + - name: http-metrics + protocol: TCP + port: 10254 + selector: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/flyte/templates/console/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconsole + namespace: union + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm + annotations: + external-dns.alpha.kubernetes.io/hostname: flyte.example.com + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "600" +spec: + type: ClusterIP + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 8080 + selector: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-service-metrics.yaml +apiVersion: v1 +kind: Service +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + 
app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller-metrics + namespace: union +spec: + type: ClusterIP + ports: + - name: metrics + port: 10254 + protocol: TCP + targetPort: metrics + selector: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: controller +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-service.yaml +apiVersion: v1 +kind: Service +metadata: + annotations: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller + namespace: union +spec: + type: ClusterIP + ipFamilyPolicy: SingleStack + ipFamilies: + - IPv4 + ports: + - name: http + port: 80 + protocol: TCP + targetPort: http + appProtocol: http + - name: https + port: 443 + protocol: TCP + targetPort: https + appProtocol: https + selector: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: controller +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: scylla-operator + name: scylla-operator-webhook + labels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +spec: + ports: + - port: 443 + targetPort: 5000 + name: webhook + selector: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +--- +# Source: controlplane/templates/cacheservice/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: 
cacheservice + namespace: union + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: http + port: 88 + protocol: TCP + targetPort: http + - name: grpc + port: 89 + protocol: TCP + targetPort: grpc + - name: http-metrics + protocol: TCP + port: 10254 + selector: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/console/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: unionconsole + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: http + protocol: TCP + name: http + - port: 8081 + targetPort: http-metrics + protocol: TCP + name: http-metrics + selector: + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +--- +apiVersion: v1 +kind: Service +metadata: + name: authorizer + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: connect + - name: grpc-native + port: 8080 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: authorizer 
+ app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: cluster + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: connect + - name: grpc-native + port: 8080 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: dataproxy + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: executions + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: 
ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: queue + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: run-scheduler + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: usage + labels: + 
platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: connect + - name: grpc-native + port: 8080 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/flyte/templates/admin/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + configChecksum: "b03b2e79a2b21bbfee529edc64a35bd90243ef79845c3299905bec8bfa6dce4" + kubectl.kubernetes.io/default-container: flyteadmin + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + initContainers: + - command: + - flyteadmin + - --config + - /etc/flyte/config/*.yaml + - migrate + - run + image: "registry.unionai.cloud/controlplane/services:" + imagePullPolicy: "IfNotPresent" + name: run-migrations + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + 
volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /etc/flyte/config + name: base-config-volume + - command: + - flyteadmin + - --config + - /etc/flyte/config/*.yaml + - migrate + - seed-projects + - union-health-monitoring + - flytesnacks + image: "registry.unionai.cloud/controlplane/services:" + imagePullPolicy: "IfNotPresent" + name: seed-projects + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /etc/flyte/config + name: base-config-volume + - name: generate-secrets + image: "registry.unionai.cloud/controlplane/services:" + imagePullPolicy: "IfNotPresent" + command: ["/bin/sh", "-c"] + args: + [ + "flyteadmin --config=/etc/flyte/config/*.yaml secrets init --localPath /etc/scratch/secrets && flyteadmin --config=/etc/flyte/config/*.yaml secrets create --name flyte-admin-secrets --fromPath /etc/scratch/secrets", + ] + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - mountPath: /etc/flyte/config + name: base-config-volume + - mountPath: /etc/scratch + name: scratch + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + containers: + - command: + - flyteadmin + - --config + - /etc/flyte/config/*.yaml + - serve + image: "registry.unionai.cloud/controlplane/services:" + imagePullPolicy: "IfNotPresent" + name: flyteadmin + ports: + - containerPort: 8088 + - containerPort: 8089 + - containerPort: 10254 + readinessProbe: + httpGet: + path: /healthcheck + port: 8088 + initialDelaySeconds: 15 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthcheck + port: 8088 + initialDelaySeconds: 20 + timeoutSeconds: 1 + periodSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + resources: + limits: + cpu: 2 + ephemeral-storage: 500Mi + memory: 3Gi + requests: + 
cpu: 50m + ephemeral-storage: 200Mi + memory: 500Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /srv/flyte + name: shared-data + - mountPath: /etc/flyte/config + name: clusters-config-volume + - mountPath: /etc/secrets/ + name: admin-secrets + - mountPath: /etc/secrets/union + name: union-secrets + readOnly: true + serviceAccountName: flyteadmin + volumes: + - name: union-controlplane-secrets + secret: + secretName: union-controlplane-secrets + - emptyDir: {} + name: shared-data + - emptyDir: {} + name: scratch + - projected: + sources: + - configMap: + name: flyte-admin-base-config + name: base-config-volume + - projected: + sources: + - configMap: + name: flyte-admin-base-config + - configMap: + name: flyte-admin-clusters-config + name: clusters-config-volume + - name: admin-secrets + secret: + secretName: flyte-admin-secrets + - name: union-secrets + secret: + secretName: '' +--- +# Source: controlplane/charts/flyte/templates/console/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconsole + namespace: union + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "2620ed20cf30d64460b231bbcf13fc096a23b6d373b46e69ab5f2e051f3d3d1" + linkerd.io/inject: disabled + prometheus.io/scrape: "false" + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm + spec: + imagePullSecrets: + - name: union-registry-secret + securityContext: + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + runAsUser: 1000 + seLinuxOptions: + 
type: spc_t + containers: + - image: "registry.unionai.cloud/controlplane/flyteconsole:" + imagePullPolicy: "IfNotPresent" + name: flyteconsole + envFrom: + - configMapRef: + name: flyte-console-config + ports: + - containerPort: 8080 + env: + - name: ENABLE_GA + value: "true" + - name: GA_TRACKING_ID + value: "" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 250m + ephemeral-storage: 200Mi + memory: 250Mi + requests: + cpu: 10m + ephemeral-storage: 20Mi + memory: 50Mi + volumeMounts: + - mountPath: /srv/flyte + name: shared-data + volumes: + - emptyDir: {} + name: shared-data +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller + namespace: union +spec: + selector: + matchLabels: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: controller + replicas: 1 + revisionHistoryLimit: 10 + minReadySeconds: 0 + template: + metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + spec: + dnsPolicy: ClusterFirst + containers: + - name: controller + image: registry.k8s.io/ingress-nginx/controller:v1.12.3@sha256:ac444cd9515af325ba577b596fe4f27a34be1aa330538e8b317ad9d6c8fb94ee + imagePullPolicy: IfNotPresent + lifecycle: + preStop: + exec: + command: + - /wait-shutdown + args: + - 
/nginx-ingress-controller + - --publish-service=$(POD_NAMESPACE)/controlplane-nginx-controller + - --election-id=controlplane-nginx-leader + - --controller-class=union.ai/controlplane + - --ingress-class=nginx + - --configmap=$(POD_NAMESPACE)/controlplane-nginx-controller + - --enable-metrics=true + - --default-ssl-certificate=/ + securityContext: + runAsNonRoot: true + runAsUser: 101 + runAsGroup: 82 + allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault + capabilities: + drop: + - ALL + add: + - NET_BIND_SERVICE + readOnlyRootFilesystem: false + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: LD_PRELOAD + value: /usr/local/lib/libmimalloc.so + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + ports: + - name: http + containerPort: 80 + protocol: TCP + - name: https + containerPort: 443 + protocol: TCP + - name: metrics + containerPort: 10254 + protocol: TCP + resources: + requests: + cpu: 100m + memory: 90Mi + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: controlplane-nginx + terminationGracePeriodSeconds: 300 +--- +# Source: controlplane/charts/scylla-operator/templates/operator.deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: scylla-operator + namespace: scylla-operator + labels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator +spec: + replicas: 2 + strategy: + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator + template: + metadata: + labels: + 
app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator + spec: + serviceAccountName: scylla-operator + containers: + - name: scylla-operator + image: scylladb/scylla-operator:1.18.1 + imagePullPolicy: IfNotPresent + env: + - name: SCYLLA_OPERATOR_IMAGE + value: scylladb/scylla-operator:1.18.1 + args: + - operator + - --loglevel=2 + resources: + requests: + cpu: 100m + memory: 20Mi + terminationGracePeriodSeconds: 10 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: scylla-operator + app.kubernetes.io/name: scylla-operator + topologyKey: kubernetes.io/hostname + weight: 1 +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: scylla-operator + name: webhook-server + labels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +spec: + replicas: 2 + strategy: + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server + template: + metadata: + labels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server + spec: + serviceAccountName: "webhook-server" + containers: + - name: webhook-server + image: scylladb/scylla-operator:1.18.1 + imagePullPolicy: IfNotPresent + args: + - run-webhook-server + - --loglevel=2 + - --tls-cert-file=/tmp/serving-certs/tls.crt + - --tls-private-key-file=/tmp/serving-certs/tls.key + livenessProbe: + httpGet: + path: /readyz + port: 5000 + scheme: HTTPS + readinessProbe: + httpGet: + path: /readyz + port: 5000 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + lifecycle: + preStop: + exec: + command: + - /usr/bin/sleep + - 15s + ports: + - containerPort: 5000 + name: webhook-server + protocol: TCP + resources: + requests: + cpu: 10m + memory: 20Mi + 
volumeMounts: + - mountPath: /tmp/serving-certs + name: cert + readOnly: true + terminationGracePeriodSeconds: 75 + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: scylla-operator-serving-cert + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: webhook-server + app.kubernetes.io/name: webhook-server + topologyKey: kubernetes.io/hostname + weight: 1 +--- +# Source: controlplane/templates/cacheservice/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cacheservice + namespace: union + labels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "197f4097faabcce1c83bbd79953460800854d6b92837333f3dd60c6c1bfa14a" + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + platform.union.ai/zone: "controlplane" + + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/managed-by: Helm + spec: + securityContext: + fsGroup: 1001 + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + initContainers: + - command: + - cacheservice + - --config + - /etc/cacheservice/config/*.yaml + - migrate + - run + image: "registry.unionai.cloud/controlplane/services:" + imagePullPolicy: "IfNotPresent" + name: run-migrations + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /etc/cacheservice/config + name: config-volume + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + containers: + - command: + - 
cacheservice + - --config + - /etc/cacheservice/config/*.yaml + - serve + image: "registry.unionai.cloud/controlplane/services:" + imagePullPolicy: "IfNotPresent" + name: cacheservice + ports: + - name: http + containerPort: 8088 + - name: grpc + containerPort: 8089 + - name: http-metrics + containerPort: 10254 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 1 + ephemeral-storage: 200Mi + requests: + cpu: 500m + ephemeral-storage: 200Mi + memory: 200Mi + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /etc/cacheservice/config + name: config-volume + - mountPath: /etc/secrets/union + name: union-secrets + readOnly: true + serviceAccountName: cacheservice + volumes: + - name: union-controlplane-secrets + secret: + secretName: union-controlplane-secrets + - emptyDir: {} + name: shared-data + - configMap: + name: cacheservice-config + name: config-volume + - name: union-secrets + secret: + secretName: '' + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: cacheservice + topologyKey: kubernetes.io/hostname +--- +# Source: controlplane/templates/console/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: unionconsole + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + platform.union.ai/zone: "controlplane" + 
app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + spec: + imagePullSecrets: + - name: union-registry-secret + serviceAccountName: unionconsole + securityContext: + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + runAsUser: 1000 + seLinuxOptions: + type: spc_t + containers: + - name: unionconsole + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + image: "registry.unionai.cloud/controlplane/unionconsole:2026.4.7" + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + protocol: TCP + - name: http-metrics + containerPort: 8081 + protocol: TCP + env: + - name: UNION_ORG_OVERRIDE + value: '' + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi +--- +# Source: controlplane/templates/deployment.yaml +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: authorizer + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: authorizer + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + spec: + imagePullSecrets: + - name: union-registry-secret + serviceAccountName: authorizer + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: authorizer + containers: + - name: authorizer + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - 
authorizer + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + - name: connect + containerPort: 8081 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: cluster + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + spec: + 
imagePullSecrets: + - name: union-registry-secret + serviceAccountName: cluster + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: cluster + initContainers: + - name: cluster-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - cloudcluster + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: cluster + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - cloudcluster + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + - name: connect + containerPort: 8081 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: 
controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataproxy + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: dataproxy + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + spec: + imagePullSecrets: + - name: union-registry-secret + serviceAccountName: dataproxy + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: dataproxy + containers: + - name: dataproxy + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - dataproxy + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: 
debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executions + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: executions + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + spec: + imagePullSecrets: + - name: union-registry-secret + serviceAccountName: executions + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: executions + initContainers: + - name: executions-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: executions + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + 
containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: queue + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: queue + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + spec: + imagePullSecrets: + - name: union-registry-secret + serviceAccountName: queue + volumes: + - name: secrets + secret: + secretName: + - 
name: db-pass + secret: + secretName: + - name: config + configMap: + name: queue + initContainers: + - name: queue-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - queue + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: queue + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - queue + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: run-scheduler + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name 
+ app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: run-scheduler + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + spec: + imagePullSecrets: + - name: union-registry-secret + serviceAccountName: run-scheduler + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: run-scheduler + initContainers: + - name: run-scheduler-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: run-scheduler + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - scheduler + - start + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + 
requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: usage + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: usage + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + spec: + imagePullSecrets: + - name: union-registry-secret + serviceAccountName: usage + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: usage + containers: + - name: usage + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + args: + - usage + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + - name: connect + containerPort: 8081 + protocol: TCP + volumeMounts: + - name: db-pass + 
mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 3 + memory: 512Mi + requests: + cpu: 500m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/charts/flyte/templates/admin/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteadmin + minReplicas: 1 + maxReplicas: 10 + metrics: + + - resource: + name: cpu + target: + averageUtilization: 80 + type: Utilization + type: Resource + - resource: + name: memory + target: + averageUtilization: 80 + type: Utilization + type: Resource +--- +# Source: controlplane/templates/console/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: unionconsole + minReplicas: 1 + 
maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: authorizer +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: authorizer + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: cluster +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: cluster + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: dataproxy +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: dataproxy + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: executions +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: executions + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: 
Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: run-scheduler +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: run-scheduler + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: usage +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: usage + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-ingressclass.yaml +apiVersion: networking.k8s.io/v1 +kind: IngressClass +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane +spec: + controller: union.ai/controlplane +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-dataproxy + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + 
nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
+ client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /data/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /data + pathType: Prefix + backend: + service: + name: dataproxy + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-usage-grpc + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC + nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username + nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me + nginx.ingress.kubernetes.io/use-regex: "true" +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /cloudidl.usage.UsageService(/(?!GetCustomMeasuresNames|GetMeasureGroup|GetMeasureGroups|GetBillableMeasures|GetBillingInfo|ReportBillableUsage|ReportServerlessBillableUsage|CreateCustomer|AttachBillingPlanToCustomer|GetCustomerCredits|EnqueueMetronomeRequest|EnqueueStripeRequest|GetOrgCheckoutSession).*|$) + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: connect +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-usage + 
namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
+ client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service + nginx.ingress.kubernetes.io/use-regex: "true" +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /usage/api/v1(/(?!custom_measures_names|measure_group|measure_groups|billable_measures|billing_info|report_billable_usage|customer_credits|checkout_session).*|$) + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-protected + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /api + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /api/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /v1/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /cloudadmin + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /cloudadmin/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: 
+ name: http + - path: /actor + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /actor/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /agent + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /agent/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /dataplane + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /dataplane/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /spark-history-server + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /spark-history-server/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /api/v1/dataproxy + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /api/v1/dataproxy/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /app + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /app/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /apps + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /apps/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /cluster + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /cluster/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /clusterpool + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: 
+ name: http + - path: /clusterpool/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /clusterconfig + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /clusterconfig/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /org + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: http + - path: /org/* + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: http + - path: /managed_cluster + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /managed_cluster/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /authorizer + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: http + - path: /authorizer/* + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: http + - path: /oauth_app + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /oauth_app/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /users + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /users/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /members + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /members/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /roles + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /roles/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: 
http + - path: /policies + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /policies/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /identities + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /identities/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /echo + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /echo/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /execution + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /execution/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_registry + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_registry/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_instance + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_instance/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /usage + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http + - path: /usage/* + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-protected-grpc + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: 
"100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
+ client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC + nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username + nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /cloudidl.execution.ExecutionService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.execution.ExecutionService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.cluster.ClusterService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.cluster.ClusterService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.cluster.ClusterNodepoolService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.cluster.ClusterNodepoolService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.apikey.APIKeyService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.apikey.APIKeyService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.AppsService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: 
/cloudidl.identity.AppsService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.org.OrgService/* + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: grpc + - path: /cloudidl.org.OrgService + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: grpc + - path: /cloudidl.cloudaccounts.CloudAccountsService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: grpc + - path: /cloudidl.cloudaccounts.CloudAccountsService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: grpc + - path: /cloudidl.cluster.ManagedClusterService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.cluster.ManagedClusterService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.identity.UserService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.UserService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.MemberService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.MemberService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.RoleService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.RoleService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.PolicyService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: 
/cloudidl.identity.PolicyService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.SelfServe/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.identity.SelfServe + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.identity.IdentityService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.identity.IdentityService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.clusterpool.ClusterPoolService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.clusterpool.ClusterPoolService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.clusterconfig.ClusterConfigService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.clusterconfig.ClusterConfigService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.authorizer.AuthorizerService/* + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: connect + - path: /cloudidl.authorizer.AuthorizerService + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: connect + - path: /cloudidl.usage.UsageService/* + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: connect + - path: /cloudidl.usage.UsageService + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: connect + - path: /datacatalog.DataCatalog/* + pathType: ImplementationSpecific + backend: + service: + name: datacatalog + port: + name: grpc + - 
path: /datacatalog.DataCatalog + pathType: ImplementationSpecific + backend: + service: + name: datacatalog + port: + name: grpc + - path: /flyteidl.cacheservice.CacheService/* + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /flyteidl.cacheservice.CacheService + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /flyteidl.cacheservice.v2.CacheService/* + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /flyteidl.cacheservice.v2.CacheService + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /cloudidl.actor.ActorEnvironmentService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.actor.ActorEnvironmentService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.agent.AgentService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.agent.AgentService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.secret.SecretService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.secret.SecretService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.secret.SecretService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.secret.SecretService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.support.SupportService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: 
/cloudidl.clouddataproxy.CloudDataProxyService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.clouddataproxy.CloudDataProxyService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl.service.DataProxyService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl.service.DataProxyService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.dataproxy.DataProxyService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.dataproxy.DataProxyService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.logs.LogsService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.logs.LogsService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceRegistryService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceRegistryService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceInstanceService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceInstanceService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.RunService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.RunService/* + pathType: ImplementationSpecific + backend: + service: + name: 
executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TranslatorService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TranslatorService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TaskService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TaskService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TriggerService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TriggerService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.QueueService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.QueueService/* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.StateService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.StateService/* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + + - path: /flyteidl2.workflow.RunService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.RunService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - 
path: /flyteidl2.workflow.TranslatorService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.TranslatorService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.task.TaskService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.task.TaskService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.QueueService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.QueueService/* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.trigger.TriggerService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.trigger.TriggerService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.StateService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.StateService/* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.imagebuilder.ImageService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.imagebuilder.ImageService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.imagebuilder.ImageService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.imagebuilder.ImageService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.app.AppService/* 
+ pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.app.AppLogsService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.app.ReplicaService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.AppService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.app.AppLogsService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.ReplicaService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-protected-grpc-streaming + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC + nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username + nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /flyteidl2.auth.IdentityService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.auth.IdentityService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.project.ProjectService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.project.ProjectService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AdminService + 
pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AdminService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + + - path: /flyteidl.service.WatchService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + + - path: /flyteidl.service.WatchService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /cloudidl.cloudadmin.CloudAdminService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /cloudidl.cloudadmin.CloudAdminService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.IdentityService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.IdentityService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /cloudidl.echo.EchoService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.echo.EchoService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl.service.SignalService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.SignalService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /cloudidl.actor.ActorEnvironmentService/Stream* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.execution.ExecutionService/GetExecutionOperation + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: 
/cloudidl.workflow.RunLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.RunService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService/Record* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService/Update* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TaskService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.LeaseService/Heartbeat + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.QueueService/Heartbeat + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.StateService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.QueueService/StreamLeases + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.LeaseService/StreamLeases + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + + - path: /flyteidl2.workflow.RunLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.RunService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.task.TaskService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.QueueService/Heartbeat + pathType: ImplementationSpecific + backend: + 
service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.StateService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.QueueService/StreamLeases + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.logs.LogsService/TailTaskExecutionLogs + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceInstanceService/WatchWorkspaceInstances + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.app.AppService/Watch + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.app.AppService/Lease + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.app.AppLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.app.ReplicaService/WatchReplicas + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.AppService/Watch + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.app.AppService/Lease + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.app.AppLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.ReplicaService/WatchReplicas + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane + namespace: union + annotations: + 
nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + # Port 87 in FlyteAdmin maps to the redoc container. 
+ - path: /openapi + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: redoc + - path: /healthcheck + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /healthz + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /me + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + # Port 87 in FlyteAdmin maps to the redoc container. + - path: /openapi/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: redoc + - path: /.well-known + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /.well-known/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /login + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /login/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /logout + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /logout/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /callback + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /callback/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /config + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /config/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /oauth2 + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /oauth2/* + pathType: ImplementationSpecific + backend: + 
service: + name: flyteadmin + port: + name: http + - path: /auth + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /auth/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /enqueue_metronome_request/v1 + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http + - path: /enqueue_metronome_request/v1/* + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http + - path: /enqueue_stripe_request/v1 + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http + - path: /enqueue_stripe_request/v1/* + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http +--- +# Source: controlplane/templates/flyte-core-app.yaml +# Certain ingress controllers like nginx cannot serve HTTP 1 and GRPC with a single ingress because GRPC can only +# enabled on the ingress object, not on backend services (GRPC annotation is set on the ingress, not on the services). +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-grpc + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
+ - path: /flyteidl.service.HealthService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.HealthService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AuthMetadataService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AuthMetadataService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.auth.AuthMetadataService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.auth.AuthMetadataService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-grpc-streaming + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /flyteidl.service.WatchService/WatchExecutionStatusUpdates + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-console-protected + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # 
large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: / + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + # NOTE: If you change this, you must update the BASE_URL value in flyteconsole.yaml + - path: /console + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /console/* + pathType: 
ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /dashboard + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /dashboard/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /resources + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /resources/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /cost + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /cost/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /loading + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /loading/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /v2 + pathType: ImplementationSpecific + backend: + service: + name: unionconsole + port: + name: http + - path: /v2/* + pathType: ImplementationSpecific + backend: + service: + name: unionconsole + port: + name: http +--- +# Source: controlplane/charts/scylla-operator/templates/validatingwebhook.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + annotations: + cert-manager.io/inject-ca-from: scylla-operator/scylla-operator-serving-cert + name: scylla-operator +webhooks: +- name: webhook.scylla.scylladb.com + clientConfig: + service: + name: scylla-operator-webhook + namespace: scylla-operator + path: /validate + admissionReviewVersions: + - v1 + sideEffects: None + failurePolicy: Fail + rules: + - apiGroups: + - scylla.scylladb.com + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - scyllaclusters + - apiGroups: + - scylla.scylladb.com + apiVersions: + - 
v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - nodeconfigs + - scyllaoperatorconfigs + - scylladbdatacenters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-poddisruptionbudget.yaml +# PDB is not supported for DaemonSets. +# https://github.com/kubernetes/kubernetes/issues/108124 +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/charts/scylla-operator/templates/certificate.yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: scylla-operator-serving-cert + namespace: scylla-operator +spec: + dnsNames: + - scylla-operator-webhook.scylla-operator.svc + issuerRef: + kind: Issuer + name: scylla-operator-selfsigned-issuer + secretName: scylla-operator-serving-cert +--- +# Source: controlplane/charts/scylla-operator/templates/issuer.yaml +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: scylla-operator-selfsigned-issuer + namespace: scylla-operator +spec: + selfSigned: {} +--- +# Source: controlplane/charts/scylla/templates/scyllacluster.yaml +apiVersion: scylla.scylladb.com/v1 +kind: ScyllaCluster +metadata: + name: scylla + namespace: union +spec: + version: 2025.1.5 + agentVersion: 3.5.1@sha256:d1b57d08b9949c8faad2048fdf4dc7c502dae81da856c3c6b3a77dd347d5c7fc + repository: scylladb/scylla + agentRepository: scylladb/scylla-manager-agent + developerMode: true + sysctls: + - fs.aio-max-nr=30000000 + datacenter: + name: dc1 + racks: + - agentResources: + requests: + cpu: 50m + memory: 10M + members: 3 + name: rack1 + placement: + nodeAffinity: {} + tolerations: [] + resources: + limits: + cpu: 2 + memory: 4Gi + requests: + cpu: 1 + memory: 2Gi + storage: + capacity: 100Gi + storageClassName: scylladb diff --git 
a/tests/generated/controlplane.external-authz.yaml b/tests/generated/controlplane.external-authz.yaml index 89d99720..ac76450b 100644 --- a/tests/generated/controlplane.external-authz.yaml +++ b/tests/generated/controlplane.external-authz.yaml @@ -683,6 +683,17 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] externalClient: failOpen: false forwardHeaders: @@ -691,8 +702,16 @@ data: grpcConfig: host: dns:///my-authz-server.default.svc.cluster.local:50051 insecure: true + internalCommunicationConfig: + enabled: false type: External useExternalIdentity: 'true' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false @@ -3581,12 +3600,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3863,17 +3882,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", 
"legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -3910,17 +3929,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": "C" } @@ -3971,12 +3990,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": 
"rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4013,17 +4032,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4060,7 +4079,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4089,10 +4108,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": 
"Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4648,17 +4671,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } diff --git a/tests/generated/controlplane.userclouds.yaml b/tests/generated/controlplane.userclouds.yaml index e6e04d77..473cbef1 100644 --- a/tests/generated/controlplane.userclouds.yaml +++ b/tests/generated/controlplane.userclouds.yaml @@ -31,6 +31,24 @@ spec: app.kubernetes.io/name: webhook-server app.kubernetes.io/instance: webhook-server --- +# Source: controlplane/templates/authz/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + minAvailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name +--- # Source: controlplane/templates/console/pdb.yaml apiVersion: policy/v1 kind: 
PodDisruptionBudget @@ -217,6 +235,18 @@ metadata: app.kubernetes.io/name: webhook-server app.kubernetes.io/instance: webhook-server --- +# Source: controlplane/templates/authz/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +--- # Source: controlplane/templates/cacheservice/rbac.yaml apiVersion: v1 kind: ServiceAccount @@ -586,6 +616,52 @@ data: BASE_URL: /console CONFIG_DIR: /etc/flyte/config --- +# Source: controlplane/templates/authz/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-union-authz-config + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + database: + host: "" + port: 5432 + name: "userclouds" + user: "" + password: "file:///etc/db/pass.txt" + sslMode: "require" + mode: "normal" + + auth: + issuer: "http://release-name-union-authz.union.svc.cluster.local:8080" + signingKey: "kube://secrets/userclouds-signing-key?key=signing_key" + apps: + - credentials: + - clientId: 'union-authz-client' + clientSecret: kube://secrets/?key=client_secret + id: union-controlplane + name: union-controlplane + + cache: + enabled: true + type: "memory" + ttl: "60m" + memory: + maxEntries: 100000 + shards: 128 + depShards: 128 + + services: + checkAttributeEndpoint: "http://localhost:8080" + idpEndpoint: "http://localhost:8080" + authzEndpoint: "http://localhost:8080" +--- # Source: controlplane/templates/cacheservice/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -682,12 +758,31 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: 
+ - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] externalClient: forwardHeaders: - authorization - flyte-authorization - type: Noop + internalCommunicationConfig: + enabled: false + type: UserClouds useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false @@ -712,12 +807,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -787,12 +882,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -850,12 +945,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -934,12 +1029,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ 
-1005,12 +1100,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -1074,12 +1169,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -1138,12 +1233,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -3576,12 +3671,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3858,17 +3953,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", 
"legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -3905,17 +4000,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": "C" } @@ -3966,12 +4061,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": 
"rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4008,17 +4103,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4055,7 +4150,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4084,10 +4179,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": 
"Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4643,17 +4742,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -5806,6 +5905,22 @@ rules: - create - patch --- +# Source: controlplane/templates/authz/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: release-name-union-authz-secrets-manager + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "update", "delete"] +--- # Source: controlplane/templates/flyte-core-app.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -5879,6 +5994,26 @@ subjects: name: 'envoy-gateway' namespace: 'union' --- +# Source: controlplane/templates/authz/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-union-authz-secrets-manager + labels: + helm.sh/chart: 
controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: release-name-union-authz-secrets-manager +subjects: + - kind: ServiceAccount + name: release-name-union-authz + namespace: union +--- # Source: controlplane/templates/flyte-core-app.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -6012,6 +6147,28 @@ spec: app.kubernetes.io/name: webhook-server app.kubernetes.io/instance: webhook-server --- +# Source: controlplane/templates/authz/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name +--- # Source: controlplane/templates/cacheservice/service.yaml apiVersion: v1 kind: Service @@ -6814,6 +6971,120 @@ spec: topologyKey: kubernetes.io/hostname weight: 1 --- +# Source: controlplane/templates/authz/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + checksum/config: 8bda5502d8cb82e6d35a0d7495c605eba4ea4137a8294c0149d7b60dafb1d458 + linkerd.io/inject: 
disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + platform.union.ai/zone: "controlplane" + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: release-name-union-authz + terminationGracePeriodSeconds: 45 + securityContext: + fsGroup: 1000 + runAsGroup: 1000 + runAsNonRoot: true + runAsUser: 1000 + containers: + - name: userclouds-lite + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 + imagePullPolicy: IfNotPresent + command: + - userclouds-lite + args: + - serve + - all + - --config=/etc/userclouds/config.yaml + - --addr=:8080 + - --static=/usr/share/userclouds/static + ports: + - name: http + containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /readyz + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + resources: + limits: + cpu: "1" + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + lifecycle: + preStop: + exec: + command: ["sleep", "5"] + volumeMounts: + - name: config + mountPath: /etc/userclouds + readOnly: true + - name: db-pass + mountPath: /etc/db + - name: tmp + mountPath: /tmp + volumes: + - name: config + configMap: + name: release-name-union-authz-config + - name: db-pass + secret: + secretName: + - name: tmp + emptyDir: {} + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: union-authz + topologyKey: kubernetes.io/hostname +--- # Source: controlplane/templates/cacheservice/deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -7866,6 +8137,32 @@ spec: type: Utilization type: Resource --- +# Source: 
controlplane/templates/authz/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.7 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.7" + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: release-name-union-authz + minReplicas: 3 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 +--- # Source: controlplane/templates/console/hpa.yaml apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler diff --git a/tests/values/controlplane.custom-oidc.yaml b/tests/values/controlplane.custom-oidc.yaml new file mode 100644 index 00000000..34d42a76 --- /dev/null +++ b/tests/values/controlplane.custom-oidc.yaml @@ -0,0 +1,57 @@ +# helm-values: values.aws.selfhosted-intracluster.yaml +# Test fixture: Custom OIDC provider configuration. +# Exercises the OAuth2 globals for non-Okta IdPs (e.g. Entra ID, Keycloak). +# All values use generic, non-internal names. 
+ +global: + INTERNAL_CLIENT_ID: "test-internal-client-id" + AUTH_TOKEN_URL: "https://idp.example.com/oauth2/v2.0/token" + OIDC_BASE_URL: "https://idp.example.com/oauth2/v2.0" + OIDC_CLIENT_ID: "00000000-1111-2222-3333-444444444444" + CLI_CLIENT_ID: "55555555-6666-7777-8888-999999999999" + OIDC_METADATA_URL: ".well-known/openid-configuration" + OIDC_ALLOWED_AUDIENCE: + - "api://my-app" + - "00000000-1111-2222-3333-444444444444" + OIDC_APP_SCOPE: "api://my-app/all" + OIDC_APP_AUDIENCE: "api://my-app" + +dbHost: "db-instance-url" +dbName: "dbName" +dbUser: "dbUser" +dbPass: "dbPass" +bucketName: "bucketName" +artifactsBucketName: "artifactsBucketName" + +configMap: + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain +controlplane: + enabled: true +ingress: + host: fake-host.domain + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret +flyte: + common: + ingress: + tls: + secretName: fake-host-tls-secret + host: fake-host.domain + configmap: + admin: + admin: + endpoint: dns:///fake-host.domain + insecure: false + adminServer: + auth: + appAuth: + # Identity type claim mapping for non-Okta IdPs. + # This is set in values overlay, not via a global. + identityTypeClaimsForApps: + idtyp: + - app diff --git a/tests/values/controlplane.userclouds.yaml b/tests/values/controlplane.userclouds.yaml index 300591a6..cd8a2a64 100644 --- a/tests/values/controlplane.userclouds.yaml +++ b/tests/values/controlplane.userclouds.yaml @@ -63,5 +63,8 @@ flyte: endpoint: dns:///fake-host.domain insecure: false -global: - AUTHZ_TYPE: "union" +services: + authorizer: + configMap: + authorizer: + type: "UserClouds"