From f433da6fa823f43b25c3fd41e07985be55a1ddf5 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Thu, 2 Apr 2026 19:06:46 +1100 Subject: [PATCH 01/23] Add defaultIdentityToSubject config for selfhosted IdPs (FAB-189) When true, defaults to user identity if x-user-claim-identitytype header is missing from gRPC metadata. Enables selfhosted deployments with non-Okta IdPs (Apple IdMS, Entra ID) that cannot easily add custom JWT claims. BYOC overrides to false. Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/controlplane/values.yaml | 5 +++++ tests/generated/controlplane.aws.billing-enable.yaml | 7 +++++++ tests/generated/controlplane.aws.yaml | 7 +++++++ tests/generated/controlplane.userclouds.yaml | 7 +++++++ 4 files changed, 26 insertions(+) diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index 9c17bb22..2eb374fd 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -295,6 +295,11 @@ configMap: - authorization - flyte-authorization - x-user-token + # When true, default to user identity if x-user-claim-identitytype + # header is missing (e.g. IdPs without the identitytype custom claim). + # When false, identity resolution fails if the header is absent. + # Defaults to true in selfhosted; BYOC overrides to false. 
+ defaultIdentityToSubject: true cache: identity: enabled: false diff --git a/tests/generated/controlplane.aws.billing-enable.yaml b/tests/generated/controlplane.aws.billing-enable.yaml index 8b3d730d..71b81720 100644 --- a/tests/generated/controlplane.aws.billing-enable.yaml +++ b/tests/generated/controlplane.aws.billing-enable.yaml @@ -675,6 +675,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -739,6 +740,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -814,6 +816,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -877,6 +880,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -963,6 +967,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1032,6 +1037,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1101,6 +1107,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization diff --git a/tests/generated/controlplane.aws.yaml b/tests/generated/controlplane.aws.yaml index 54d471d1..e1bf791d 100644 --- a/tests/generated/controlplane.aws.yaml +++ b/tests/generated/controlplane.aws.yaml @@ -675,6 +675,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -739,6 +740,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -814,6 +816,7 @@ data: 
config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -877,6 +880,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -963,6 +967,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1032,6 +1037,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1101,6 +1107,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization diff --git a/tests/generated/controlplane.userclouds.yaml b/tests/generated/controlplane.userclouds.yaml index e6e04d77..7cf1b894 100644 --- a/tests/generated/controlplane.userclouds.yaml +++ b/tests/generated/controlplane.userclouds.yaml @@ -675,6 +675,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -739,6 +740,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -814,6 +816,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -877,6 +880,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -963,6 +967,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1032,6 +1037,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1101,6 +1107,7 @@ data: config.yaml: | 
authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization From 4320fc9a2dc029d0e62d663bbb22ee034d1a51ec Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Tue, 31 Mar 2026 11:36:43 +1100 Subject: [PATCH 02/23] Provide UserClouds client defaults in controlplane values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pre-configured userCloudsClient defaults under services.authorizer.configMap.authorizer so that enabling Union RBAC only requires setting type: "UserClouds" — no other configuration needed. All connection details (tenantUrl, tenantID, clientID, clientSecretName) are derived from existing chart values. This eliminates the need for Terraform or manual overrides to supply the userCloudsClient block, reducing configuration surface for selfhosted deployments. Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/controlplane/values.yaml | 38 ++- .../controlplane.aws.billing-enable.yaml | 19 ++ tests/generated/controlplane.aws.yaml | 19 ++ .../controlplane.external-authz.yaml | 26 ++ tests/generated/controlplane.userclouds.yaml | 323 +++++++++++++++++- tests/values/controlplane.userclouds.yaml | 7 +- 6 files changed, 403 insertions(+), 29 deletions(-) diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index 2eb374fd..b4face91 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -417,20 +417,9 @@ services: # # Supported types: # - "Noop" — no enforcement (default) - # - "UserClouds" — Union Cloud's authorization backend + # - "UserClouds" — Union RBAC (just set type, defaults are pre-configured) # - "External" — customer-provided gRPC authorization server (selfhosted) # - # --- Union Cloud (UserClouds) --- - # For Union Cloud deployments, set type to "UserClouds": - # authorizer: - # type: "UserClouds" - # userCloudsClient: - # tenantUrl: 'http://{{ .Release.Name }}-union-authz.{{ 
.Release.Namespace }}.svc.cluster.local:8080' - # tenantID: '623771e7-ddd6-4575-bedb-7c970ec75b87' - # clientID: '{{ .Values.union.authz.clientID }}' - # clientSecretName: 'union/client_secret' - # enableLogging: true - # # --- External Authorization (selfhosted) --- # For selfhosted deployments with a customer-provided authz server: # authorizer: @@ -472,6 +461,31 @@ services: forwardHeaders: - authorization - flyte-authorization + # --- UserClouds client defaults (pre-configured) --- + # These defaults are used when type is set to "UserClouds" (Union RBAC). + # They are ignored when type is "Noop" or "External". + # To enable Union RBAC, just change type to "UserClouds" — no other + # configuration is needed. Override individual fields only if your + # deployment uses non-standard naming or secrets. + userCloudsClient: + tenantUrl: 'http://{{ .Release.Name }}-union-authz.{{ .Release.Namespace }}.svc.cluster.local:8080' + tenantID: '623771e7-ddd6-4575-bedb-7c970ec75b87' + clientID: '{{ .Values.union.authz.clientID }}' + clientSecretName: 'union/client_secret' + enableLogging: true + internalCommunicationConfig: + enabled: false + bootstrap: + organization: "" + domains: + - development + - staging + - production + projects: [] + serviceAccounts: [] + adminUsers: [] + retryInterval: 5s + maxRetries: 30 sharedService: connectPort: 8081 metrics: diff --git a/tests/generated/controlplane.aws.billing-enable.yaml b/tests/generated/controlplane.aws.billing-enable.yaml index 71b81720..23f4bd98 100644 --- a/tests/generated/controlplane.aws.billing-enable.yaml +++ b/tests/generated/controlplane.aws.billing-enable.yaml @@ -683,12 +683,31 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] externalClient: forwardHeaders: - authorization - flyte-authorization + 
internalCommunicationConfig: + enabled: false type: Noop useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false diff --git a/tests/generated/controlplane.aws.yaml b/tests/generated/controlplane.aws.yaml index e1bf791d..88ccd041 100644 --- a/tests/generated/controlplane.aws.yaml +++ b/tests/generated/controlplane.aws.yaml @@ -683,12 +683,31 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] externalClient: forwardHeaders: - authorization - flyte-authorization + internalCommunicationConfig: + enabled: false type: Noop useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false diff --git a/tests/generated/controlplane.external-authz.yaml b/tests/generated/controlplane.external-authz.yaml index 89d99720..a68fb78d 100644 --- a/tests/generated/controlplane.external-authz.yaml +++ b/tests/generated/controlplane.external-authz.yaml @@ -676,6 +676,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -683,6 +684,17 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] externalClient: failOpen: 
false forwardHeaders: @@ -691,8 +703,16 @@ data: grpcConfig: host: dns:///my-authz-server.default.svc.cluster.local:50051 insecure: true + internalCommunicationConfig: + enabled: false type: External useExternalIdentity: 'true' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false @@ -744,6 +764,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -819,6 +840,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -882,6 +904,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -968,6 +991,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1037,6 +1061,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1106,6 +1131,7 @@ data: config.yaml: | authorizer: authorizerClient: + defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization diff --git a/tests/generated/controlplane.userclouds.yaml b/tests/generated/controlplane.userclouds.yaml index 7cf1b894..0cbed0b6 100644 --- a/tests/generated/controlplane.userclouds.yaml +++ b/tests/generated/controlplane.userclouds.yaml @@ -31,6 +31,24 @@ spec: app.kubernetes.io/name: webhook-server app.kubernetes.io/instance: webhook-server --- +# Source: controlplane/templates/authz/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.5 + 
app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + minAvailable: 2 + selector: + matchLabels: + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name +--- # Source: controlplane/templates/console/pdb.yaml apiVersion: policy/v1 kind: PodDisruptionBudget @@ -217,6 +235,18 @@ metadata: app.kubernetes.io/name: webhook-server app.kubernetes.io/instance: webhook-server --- +# Source: controlplane/templates/authz/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- # Source: controlplane/templates/cacheservice/rbac.yaml apiVersion: v1 kind: ServiceAccount @@ -586,6 +616,52 @@ data: BASE_URL: /console CONFIG_DIR: /etc/flyte/config --- +# Source: controlplane/templates/authz/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-union-authz-config + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + database: + host: "" + port: 5432 + name: "userclouds" + user: "" + password: "file:///etc/db/pass.txt" + sslMode: "require" + mode: "normal" + + auth: + issuer: "http://release-name-union-authz.union.svc.cluster.local:8080" + signingKey: "kube://secrets/userclouds-signing-key?key=signing_key" + apps: + - credentials: + - clientId: 'union-authz-client' + clientSecret: kube://secrets/?key=client_secret + id: union-controlplane + name: union-controlplane + + cache: + enabled: true + type: "memory" + ttl: "60m" + memory: + maxEntries: 100000 + shards: 128 + depShards: 128 + + services: + 
checkAttributeEndpoint: "http://localhost:8080" + idpEndpoint: "http://localhost:8080" + authzEndpoint: "http://localhost:8080" +--- # Source: controlplane/templates/cacheservice/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -683,12 +759,31 @@ data: grpcConfig: host: dns:///authorizer.union.svc.cluster.local:80 insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] externalClient: forwardHeaders: - authorization - flyte-authorization - type: Noop + internalCommunicationConfig: + enabled: false + type: UserClouds useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 cache: identity: enabled: false @@ -713,12 +808,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -789,12 +884,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -853,12 +948,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: 
enabled: true @@ -938,12 +1033,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -1010,12 +1105,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -1080,12 +1175,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -1145,12 +1240,12 @@ data: union: auth: authorizationMetadataKey: flyte-authorization - clientId: '' + clientId: 'test-internal-client-id' clientSecretLocation: /etc/secrets/union/client_secret enable: true scopes: - all - tokenUrl: '' + tokenUrl: 'https://test.example.com/oauth2/v1/token' type: ClientSecret internalConnectionConfig: enabled: true @@ -5813,6 +5908,22 @@ rules: - create - patch --- +# Source: controlplane/templates/authz/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: release-name-union-authz-secrets-manager + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "create", "update", "delete"] +--- # Source: 
controlplane/templates/flyte-core-app.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -5886,6 +5997,26 @@ subjects: name: 'envoy-gateway' namespace: 'union' --- +# Source: controlplane/templates/authz/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-union-authz-secrets-manager + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: release-name-union-authz-secrets-manager +subjects: + - kind: ServiceAccount + name: release-name-union-authz + namespace: union +--- # Source: controlplane/templates/flyte-core-app.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -6019,6 +6150,28 @@ spec: app.kubernetes.io/name: webhook-server app.kubernetes.io/instance: webhook-server --- +# Source: controlplane/templates/authz/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name +--- # Source: controlplane/templates/cacheservice/service.yaml apiVersion: v1 kind: Service @@ -6821,6 +6974,120 @@ spec: topologyKey: kubernetes.io/hostname weight: 1 --- +# Source: controlplane/templates/authz/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + 
app.kubernetes.io/managed-by: Helm +spec: + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + checksum/config: 143023ded44f2db18ddf79507adcbb11c31eb4da967c39179d54bd01bdb07f5c + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + platform.union.ai/zone: "controlplane" + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: release-name-union-authz + terminationGracePeriodSeconds: 45 + securityContext: + fsGroup: 1000 + runAsGroup: 1000 + runAsNonRoot: true + runAsUser: 1000 + containers: + - name: userclouds-lite + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + command: + - userclouds-lite + args: + - serve + - all + - --config=/etc/userclouds/config.yaml + - --addr=:8080 + - --static=/usr/share/userclouds/static + ports: + - name: http + containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /readyz + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + resources: + limits: + cpu: "1" + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + lifecycle: + preStop: + exec: + command: ["sleep", "5"] + volumeMounts: + - name: config + mountPath: /etc/userclouds + readOnly: true + - name: db-pass + mountPath: /etc/db + - name: tmp + mountPath: /tmp + volumes: + - name: config + configMap: + name: release-name-union-authz-config + - name: db-pass + secret: + secretName: + - name: tmp + emptyDir: {} + 
affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: union-authz + topologyKey: kubernetes.io/hostname +--- # Source: controlplane/templates/cacheservice/deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -7873,6 +8140,32 @@ spec: type: Utilization type: Resource --- +# Source: controlplane/templates/authz/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: release-name-union-authz + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union-authz + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: release-name-union-authz + minReplicas: 3 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 +--- # Source: controlplane/templates/console/hpa.yaml apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler diff --git a/tests/values/controlplane.userclouds.yaml b/tests/values/controlplane.userclouds.yaml index 300591a6..cd8a2a64 100644 --- a/tests/values/controlplane.userclouds.yaml +++ b/tests/values/controlplane.userclouds.yaml @@ -63,5 +63,8 @@ flyte: endpoint: dns:///fake-host.domain insecure: false -global: - AUTHZ_TYPE: "union" +services: + authorizer: + configMap: + authorizer: + type: "UserClouds" From 8775de3260c5e619eb6a8ee6b9f9466db5d8b017 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Tue, 31 Mar 2026 21:53:02 +1100 Subject: [PATCH 03/23] Fix dashboard metric name mismatches and query bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Authorizer: use full metric path (authorizer:authorizer:cloudauthorizer:connect:*) - CacheService: add _unlabeled suffix to match actual metric names - Usage: processing_time → processing_time_ms - Cluster API Latency: fix 
histogram_quantile on summary type (use quantile selector) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../union-controlplane-overview.json | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/charts/controlplane/dashboards/union-controlplane-overview.json b/charts/controlplane/dashboards/union-controlplane-overview.json index aecb5405..68435645 100644 --- a/charts/controlplane/dashboards/union-controlplane-overview.json +++ b/charts/controlplane/dashboards/union-controlplane-overview.json @@ -2299,12 +2299,12 @@ "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -2581,17 +2581,17 @@ "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get 
failures", "refId": "C" } @@ -2628,17 +2628,17 @@ "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": "C" } @@ -2689,12 +2689,12 @@ "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -2731,17 +2731,17 @@ "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": 
"authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -2778,7 +2778,7 @@ "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -3366,17 +3366,17 @@ "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } From fd9b63ae396ce9cd2678ae5db82b51a0c27bc01d Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Tue, 31 Mar 2026 21:55:55 +1100 Subject: [PATCH 04/23] Fix Authorizer Mode panel 
value mappings for case sensitivity The type label value is capitalized (e.g., "UserClouds") but mappings used lowercase keys. Add both cases to ensure matching. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../union-controlplane-overview.json | 12 +++-- charts/controlplane/values.yaml | 5 -- .../controlplane.aws.billing-enable.yaml | 53 +++++++++---------- tests/generated/controlplane.aws.yaml | 53 +++++++++---------- .../controlplane.external-authz.yaml | 53 +++++++++---------- tests/generated/controlplane.userclouds.yaml | 53 +++++++++---------- 6 files changed, 108 insertions(+), 121 deletions(-) diff --git a/charts/controlplane/dashboards/union-controlplane-overview.json b/charts/controlplane/dashboards/union-controlplane-overview.json index 68435645..0106ea99 100644 --- a/charts/controlplane/dashboards/union-controlplane-overview.json +++ b/charts/controlplane/dashboards/union-controlplane-overview.json @@ -2807,10 +2807,14 @@ { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index b4face91..019c49e9 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -295,11 +295,6 @@ configMap: - authorization - flyte-authorization - x-user-token - # When true, default to user identity if x-user-claim-identitytype - # header is missing (e.g. 
IdPs without the identitytype custom claim). - # When false, identity resolution fails if the header is absent. - # Defaults to true in selfhosted; BYOC overrides to false. - defaultIdentityToSubject: true cache: identity: enabled: false diff --git a/tests/generated/controlplane.aws.billing-enable.yaml b/tests/generated/controlplane.aws.billing-enable.yaml index 23f4bd98..d2f2cd8e 100644 --- a/tests/generated/controlplane.aws.billing-enable.yaml +++ b/tests/generated/controlplane.aws.billing-enable.yaml @@ -675,7 +675,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -759,7 +758,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -835,7 +833,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -899,7 +896,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -986,7 +982,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1056,7 +1051,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1126,7 +1120,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -3602,12 +3595,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - 
"expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3884,17 +3877,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -3931,17 +3924,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", 
"legendFormat": "Reservation released", "refId": "C" } @@ -3992,12 +3985,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4034,17 +4027,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4081,7 +4074,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + 
rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4110,10 +4103,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4669,17 +4666,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } diff --git a/tests/generated/controlplane.aws.yaml b/tests/generated/controlplane.aws.yaml index 88ccd041..98396771 100644 --- a/tests/generated/controlplane.aws.yaml +++ b/tests/generated/controlplane.aws.yaml @@ -675,7 +675,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -759,7 +758,6 @@ data: config.yaml: | authorizer: authorizerClient: - 
defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -835,7 +833,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -899,7 +896,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -986,7 +982,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1056,7 +1051,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1126,7 +1120,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -3602,12 +3595,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3884,17 +3877,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": 
"rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -3931,17 +3924,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": "C" } @@ -3992,12 +3985,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4034,17 +4027,17 @@ data: "type": "timeseries", "targets": [ { - "expr": 
"authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4081,7 +4074,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4110,10 +4103,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + 
"Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4669,17 +4666,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } diff --git a/tests/generated/controlplane.external-authz.yaml b/tests/generated/controlplane.external-authz.yaml index a68fb78d..ac76450b 100644 --- a/tests/generated/controlplane.external-authz.yaml +++ b/tests/generated/controlplane.external-authz.yaml @@ -676,7 +676,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -764,7 +763,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -840,7 +838,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -904,7 +901,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -991,7 +987,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1061,7 +1056,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true 
forwardHeaders: - authorization - flyte-authorization @@ -1131,7 +1125,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -3607,12 +3600,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3889,17 +3882,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -3936,17 +3929,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": 
"rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation released", "refId": "C" } @@ -3997,12 +3990,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4039,17 +4032,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": 
"authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4086,7 +4079,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4115,10 +4108,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4674,17 +4671,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": 
"p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } diff --git a/tests/generated/controlplane.userclouds.yaml b/tests/generated/controlplane.userclouds.yaml index 0cbed0b6..60dad826 100644 --- a/tests/generated/controlplane.userclouds.yaml +++ b/tests/generated/controlplane.userclouds.yaml @@ -751,7 +751,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -835,7 +834,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -911,7 +909,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -975,7 +972,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1062,7 +1058,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1132,7 +1127,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -1202,7 +1196,6 @@ data: config.yaml: | authorizer: authorizerClient: - defaultIdentityToSubject: true forwardHeaders: - authorization - flyte-authorization @@ -3678,12 +3671,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.95, sum by (le) (rate(cluster:svc:update_status:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "UpdateStatus p95", "refId": "A" }, { - "expr": 
"histogram_quantile(0.95, sum by (le) (rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } @@ -3960,17 +3953,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:cache_hit{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Hits", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Misses", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:get_failure{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Get failures", "refId": "C" } @@ -4007,17 +4000,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(flyte:cacheservice:cache:reservation_contention{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Contention", "refId": "A" }, { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Reservation acquired", "refId": "B" }, { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", 
"legendFormat": "Reservation released", "refId": "C" } @@ -4068,12 +4061,12 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Allowed", "refId": "A" }, { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Denied", "refId": "B" } @@ -4110,17 +4103,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "authorizer:authorize_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } @@ -4157,7 +4150,7 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + 
rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "Deny %", "refId": "A" } @@ -4186,10 +4179,14 @@ data: { "type": "value", "options": { - "noop": { "text": "Noop", "index": 0 }, - "userclouds": { "text": "UserClouds", "index": 1 }, - "external": { "text": "External", "index": 2 }, - "authorizer": { "text": "Authorizer", "index": 3 } + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } } } ] @@ -4745,17 +4742,17 @@ data: "type": "timeseries", "targets": [ { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "p50", "refId": "A" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "p90", "refId": "B" }, { - "expr": "usage:messages:processing_time{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "C" } From 09dd0ae1c8fa4063a9d0c6676d3a961e35c19b96 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Tue, 14 Apr 2026 13:31:19 +1000 Subject: [PATCH 05/23] Add OIDC_METADATA_URL global for configurable metadata discovery Flyteadmin defaults to .well-known/oauth-authorization-server (RFC 8414) for OIDC metadata discovery. Entra ID and some other providers only serve .well-known/openid-configuration. 
This global lets operators override the discovery endpoint without a manual values-overrides.yaml. Default: ".well-known/oauth-authorization-server" (preserves existing behavior). Entra ID: set to ".well-known/openid-configuration". Co-Authored-By: Claude Opus 4.6 (1M context) --- .../controlplane/values.aws.selfhosted-intracluster.yaml | 7 +++++++ .../controlplane/values.gcp.selfhosted-intracluster.yaml | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index cdf885ca..20576cca 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -114,6 +114,12 @@ global: # OIDC issuer URL # Example: "https://dev-123456.okta.com/oauth2/default" OIDC_BASE_URL: "" + # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). + # Flyteadmin resolves this against OIDC_BASE_URL to fetch JWKS and token endpoints. + # Default: ".well-known/oauth-authorization-server" (RFC 8414, supported by Okta). + # Set to ".well-known/openid-configuration" for Entra ID or other providers that + # only support OpenID Connect Discovery (RFC 5785). 
+ OIDC_METADATA_URL: ".well-known/oauth-authorization-server" # Flyteadmin OIDC client ID for browser login flow # Example: "0oa1abc2def3ghi4j5k6" OIDC_CLIENT_ID: "" @@ -283,6 +289,7 @@ flyte: authServerType: "External" externalAuthServer: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' + metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' thirdPartyConfig: flyteClient: clientId: '{{ .Values.global.CLI_CLIENT_ID }}' diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index eb93fbf0..cbe8f0e7 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -122,6 +122,12 @@ global: # OIDC issuer URL # Example: "https://login.example.com/oauth2/default" OIDC_BASE_URL: "" + # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). + # Flyteadmin resolves this against OIDC_BASE_URL to fetch JWKS and token endpoints. + # Default: ".well-known/oauth-authorization-server" (RFC 8414, supported by Okta). + # Set to ".well-known/openid-configuration" for Entra ID or other providers that + # only support OpenID Connect Discovery (RFC 5785). 
+ OIDC_METADATA_URL: ".well-known/oauth-authorization-server" # Flyteadmin OIDC client ID for browser login flow # Example: "0oa1abc2def3ghi4j5k6" OIDC_CLIENT_ID: "" @@ -313,6 +319,7 @@ flyte: authServerType: "External" externalAuthServer: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' + metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' thirdPartyConfig: flyteClient: clientId: '{{ .Values.global.CLI_CLIENT_ID }}' From 0f2f8081201a03778bcb7378ba3fc54f87b37bde Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 06:59:50 +1000 Subject: [PATCH 06/23] Add OAuth2 globals for non-Okta IdP support (Entra ID, Keycloak) New globals in selfhosted-intracluster values: - OIDC_ALLOWED_AUDIENCE: custom JWT audiences for access token validation - OIDC_APP_SCOPE: resource scope for app-specific access tokens - OIDC_APP_AUDIENCE: audience for CLI/SDK PKCE flow Improved documentation on all existing OAuth globals with Okta and Entra ID examples, provider-specific guidance, and cross-references to identityTypeClaimsForApps (configured in values overlay, not global). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 69 ++++++++++++++++--- .../values.gcp.selfhosted-intracluster.yaml | 69 ++++++++++++++++--- charts/controlplane/values.yaml | 15 ++-- 3 files changed, 126 insertions(+), 27 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index 20576cca..754702af 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -108,26 +108,73 @@ global: DATAPLANE_ENDPOINT: "" # --- Authentication Configuration --- - # Set all values below to enable OIDC authentication. - # Supports any OAuth2/OIDC provider (Okta, Azure AD, Auth0, Keycloak, etc.) + # Configure your OAuth2/OIDC identity provider below. 
+ # Supports any OIDC-compliant provider (Okta, Azure AD / Entra ID, Keycloak, etc.) # - # OIDC issuer URL - # Example: "https://dev-123456.okta.com/oauth2/default" + # Required for all providers: + # OIDC_BASE_URL, OIDC_CLIENT_ID, CLI_CLIENT_ID + # INTERNAL_CLIENT_ID, AUTH_TOKEN_URL (in base values.yaml) + # + # Provider-specific (may be required depending on your IdP): + # OIDC_METADATA_URL, OIDC_ALLOWED_AUDIENCE, OIDC_APP_SCOPE, OIDC_APP_AUDIENCE + # + # See also: flyte.configmap.adminServer.auth.appAuth.identityTypeClaimsForApps + # for IdP-specific identity type claim mapping (set in values overlay, not as a global). + + # OIDC issuer URL (authorization server base URL). + # This is the base URL for token validation, JWKS discovery, and user info. + # Okta example: "https://dev-123456.okta.com/oauth2/default" + # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" OIDC_BASE_URL: "" + # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). # Flyteadmin resolves this against OIDC_BASE_URL to fetch JWKS and token endpoints. - # Default: ".well-known/oauth-authorization-server" (RFC 8414, supported by Okta). - # Set to ".well-known/openid-configuration" for Entra ID or other providers that - # only support OpenID Connect Discovery (RFC 5785). + # Most providers support one or both of these endpoints: + # ".well-known/oauth-authorization-server" — RFC 8414 (Okta) + # ".well-known/openid-configuration" — OpenID Connect Discovery (Entra ID, Keycloak) + # Default: ".well-known/oauth-authorization-server" OIDC_METADATA_URL: ".well-known/oauth-authorization-server" - # Flyteadmin OIDC client ID for browser login flow - # Example: "0oa1abc2def3ghi4j5k6" + + # OAuth2 client ID for the browser/web UI login app (confidential client, + # authorization_code grant). This is the "flyteadmin" or "browser" app + # in your IdP's OAuth application configuration. 
+ # Okta example: "0oa1abc2def3ghi4j5k6" + # Entra ID example: "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12" OIDC_CLIENT_ID: "" - # CLI client ID for flytectl / uctl (public OAuth app, PKCE flow) - # Example: "0oa7mno8pqr9stu0v1w2" + + # OAuth2 client ID for the CLI/SDK app (public client, PKCE flow). + # Used by flytectl, uctl, and the Flyte SDK for interactive authentication. + # Okta example: "0oa7mno8pqr9stu0v1w2" + # Entra ID example: "3df10225-18a5-4636-b1ef-582e5a8ea21c" CLI_CLIENT_ID: "" + + # Allowed JWT audiences for access token validation. + # Flyteadmin checks the access token "aud" claim against this list. + # When empty, defaults to ["https://{UNION_HOST}"] (the deployment domain). + # Override for IdPs that use different audience formats in access tokens. + # Okta: typically uses the auth server issuer URL (leave empty to use default). + # Entra ID example: ["api://my-app-name", "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12"] + OIDC_ALLOWED_AUDIENCE: [] + + # OAuth2 resource scope for the flyteadmin app. + # When set, this scope is requested during browser login and CLI PKCE flows + # so the IdP returns access tokens scoped to your app (correct audience). + # Without this, some IdPs (notably Entra ID) return generic access tokens + # with the wrong audience, causing access token validation to fail. + # Okta: leave empty (Okta scopes are configured on the auth server). + # Entra ID example: "api://my-app-name/all" + OIDC_APP_SCOPE: "" + + # Audience identifier for the CLI/SDK PKCE flow. + # Some IdPs require an explicit audience parameter in the authorization request. + # Okta: leave empty (derived from auth server). + # Entra ID example: "api://my-app-name" + OIDC_APP_AUDIENCE: "" + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. # Set them in your environment-specific overlay (Terraform-generated values). + # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). 
+ # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. # ---------------------------------------------------------------------------- # SECTION 2: Image Tag Overrides diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index cbe8f0e7..594586cb 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -116,26 +116,73 @@ global: IMAGE_REPOSITORY_PREFIX: "registry.unionai.cloud/controlplane" # --- Authentication Configuration --- - # Set all values below to enable OIDC authentication. - # Supports any OAuth2/OIDC-compliant identity provider. + # Configure your OAuth2/OIDC identity provider below. + # Supports any OIDC-compliant provider (Okta, Azure AD / Entra ID, Keycloak, etc.) # - # OIDC issuer URL - # Example: "https://login.example.com/oauth2/default" + # Required for all providers: + # OIDC_BASE_URL, OIDC_CLIENT_ID, CLI_CLIENT_ID + # INTERNAL_CLIENT_ID, AUTH_TOKEN_URL (in base values.yaml) + # + # Provider-specific (may be required depending on your IdP): + # OIDC_METADATA_URL, OIDC_ALLOWED_AUDIENCE, OIDC_APP_SCOPE, OIDC_APP_AUDIENCE + # + # See also: flyte.configmap.adminServer.auth.appAuth.identityTypeClaimsForApps + # for IdP-specific identity type claim mapping (set in values overlay, not as a global). + + # OIDC issuer URL (authorization server base URL). + # This is the base URL for token validation, JWKS discovery, and user info. + # Okta example: "https://dev-123456.okta.com/oauth2/default" + # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" OIDC_BASE_URL: "" + # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). # Flyteadmin resolves this against OIDC_BASE_URL to fetch JWKS and token endpoints. - # Default: ".well-known/oauth-authorization-server" (RFC 8414, supported by Okta). 
- # Set to ".well-known/openid-configuration" for Entra ID or other providers that - # only support OpenID Connect Discovery (RFC 5785). + # Most providers support one or both of these endpoints: + # ".well-known/oauth-authorization-server" — RFC 8414 (Okta) + # ".well-known/openid-configuration" — OpenID Connect Discovery (Entra ID, Keycloak) + # Default: ".well-known/oauth-authorization-server" OIDC_METADATA_URL: ".well-known/oauth-authorization-server" - # Flyteadmin OIDC client ID for browser login flow - # Example: "0oa1abc2def3ghi4j5k6" + + # OAuth2 client ID for the browser/web UI login app (confidential client, + # authorization_code grant). This is the "flyteadmin" or "browser" app + # in your IdP's OAuth application configuration. + # Okta example: "0oa1abc2def3ghi4j5k6" + # Entra ID example: "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12" OIDC_CLIENT_ID: "" - # CLI client ID for flytectl / uctl (public OAuth app, PKCE flow) - # Example: "0oa7mno8pqr9stu0v1w2" + + # OAuth2 client ID for the CLI/SDK app (public client, PKCE flow). + # Used by flytectl, uctl, and the Flyte SDK for interactive authentication. + # Okta example: "0oa7mno8pqr9stu0v1w2" + # Entra ID example: "3df10225-18a5-4636-b1ef-582e5a8ea21c" CLI_CLIENT_ID: "" + + # Allowed JWT audiences for access token validation. + # Flyteadmin checks the access token "aud" claim against this list. + # When empty, defaults to ["https://{UNION_HOST}"] (the deployment domain). + # Override for IdPs that use different audience formats in access tokens. + # Okta: typically uses the auth server issuer URL (leave empty to use default). + # Entra ID example: ["api://my-app-name", "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12"] + OIDC_ALLOWED_AUDIENCE: [] + + # OAuth2 resource scope for the flyteadmin app. + # When set, this scope is requested during browser login and CLI PKCE flows + # so the IdP returns access tokens scoped to your app (correct audience). 
+ # Without this, some IdPs (notably Entra ID) return generic access tokens + # with the wrong audience, causing access token validation to fail. + # Okta: leave empty (Okta scopes are configured on the auth server). + # Entra ID example: "api://my-app-name/all" + OIDC_APP_SCOPE: "" + + # Audience identifier for the CLI/SDK PKCE flow. + # Some IdPs require an explicit audience parameter in the authorization request. + # Okta: leave empty (derived from auth server). + # Entra ID example: "api://my-app-name" + OIDC_APP_AUDIENCE: "" + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. # Set them in your environment-specific overlay (Terraform-generated values). + # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). + # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. # ---------------------------------------------------------------------------- # SECTION 2: Image Tag Overrides diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index 019c49e9..65e818ae 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -54,13 +54,18 @@ global: # Ingress controller provider. Options: "nginx", "envoy", "both" INGRESS_PROVIDER: nginx - # OAuth2 client ID for service-to-service authentication (client_credentials flow). - # Services use this to acquire tokens for internal calls through nginx. - # Example: "0oa3xyz4abc5def6g7h8" + # OAuth2 client ID for service-to-service authentication (client_credentials grant). + # Used by controlplane services (executions, queue, cluster) to acquire tokens + # for internal calls through nginx. This is OAuth App 3 ("internal/service-to-service") + # in the authentication architecture. + # Okta example: "0oa3xyz4abc5def6g7h8" + # Entra ID example: "dc0ea3fc-f32b-4df4-98c1-3681e5a36bc6" INTERNAL_CLIENT_ID: "" - # OAuth2 token endpoint for service-to-service authentication. 
- # Example: "https://dev-123456.okta.com/oauth2/default/v1/token" + # OAuth2 token endpoint URL for service-to-service authentication. + # Used with INTERNAL_CLIENT_ID for client_credentials token acquisition. + # Okta example: "https://dev-123456.okta.com/oauth2/default/v1/token" + # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/oauth2/v2.0/token" AUTH_TOKEN_URL: "" # ---------------------------------------------------------------------------- From 082ca8f0347c5390c8329751f03567e0e64e1991 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 07:05:12 +1000 Subject: [PATCH 07/23] Add custom OIDC test fixture for non-Okta IdP configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests the new OAuth2 globals (OIDC_ALLOWED_AUDIENCE, OIDC_APP_SCOPE, OIDC_APP_AUDIENCE, OIDC_METADATA_URL) and identityTypeClaimsForApps with generic values — no internal names, customer details, or environment-specific configuration. Snapshot generated but globals not yet wired into chart templates. Template wiring is the next step. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/generated/controlplane.custom-oidc.yaml | 10571 ++++++++++++++++ tests/values/controlplane.custom-oidc.yaml | 56 + 2 files changed, 10627 insertions(+) create mode 100644 tests/generated/controlplane.custom-oidc.yaml create mode 100644 tests/values/controlplane.custom-oidc.yaml diff --git a/tests/generated/controlplane.custom-oidc.yaml b/tests/generated/controlplane.custom-oidc.yaml new file mode 100644 index 00000000..07c19311 --- /dev/null +++ b/tests/generated/controlplane.custom-oidc.yaml @@ -0,0 +1,10571 @@ +--- +# Source: controlplane/templates/scylla/namespaces.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: scylla-operator +--- +# Source: controlplane/charts/scylla-operator/templates/operator.pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: scylla-operator + namespace: scylla-operator +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: webhook-server + namespace: scylla-operator +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +--- +# Source: controlplane/templates/console/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/flyte-core-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: flyteadmin + 
namespace: union +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyteadmin +--- +# Source: controlplane/templates/flyte-core-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: datacatalog + namespace: union +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: datacatalog +--- +# Source: controlplane/templates/flyte-core-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cacheservice + namespace: union +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: cacheservice +--- +# Source: controlplane/templates/pdb.yaml +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: authorizer +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cluster +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: dataproxy +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: executions +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: queue +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: 
PodDisruptionBudget +metadata: + name: run-scheduler +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: usage +spec: + minAvailable: "33%" + selector: + matchLabels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: envoy-gateway + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/charts/flyte/templates/admin/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/charts/scylla-operator/templates/operator.serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: scylla-operator + namespace: scylla-operator + labels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + namespace: scylla-operator + name: webhook-server + labels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +--- +# Source: controlplane/templates/cacheservice/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cacheservice + namespace: union + labels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: 
controlplane-2026.4.5 + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/console/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: authorizer + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cluster + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: dataproxy + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: executions + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: queue + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + 
app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: run-scheduler + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: usage + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/templates/union-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: union + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: union + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- +# Source: controlplane/charts/flyte/templates/admin/secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: flyte-admin-secrets + namespace: union +type: Opaque +stringData: +--- +# Source: controlplane/charts/flyte/templates/common/secret-auth.yaml +apiVersion: v1 +kind: Secret +metadata: + name: flyte-secret-auth + namespace: union +type: Opaque +stringData: + client_secret: foobar +--- +# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy-gateway-config + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +data: + envoy-gateway.yaml: | + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyGateway + extensionApis: {} + gateway: + 
controllerName: gateway.envoyproxy.io/gatewayclass-controller + logging: + level: + default: info + provider: + kubernetes: + rateLimitDeployment: + container: + image: docker.io/envoyproxy/ratelimit:3fb70258 + patch: + type: StrategicMerge + value: + spec: + template: + spec: + containers: + - imagePullPolicy: IfNotPresent + name: envoy-ratelimit + shutdownManager: + image: docker.io/envoyproxy/gateway:v1.6.4 + type: Kubernetes +--- +# Source: controlplane/charts/flyte/templates/admin/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-admin-clusters-config + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +data: + clusters.yaml: | + clusters: + clusterConfigs: [] + labelClusterMap: {} +--- +# Source: controlplane/charts/flyte/templates/admin/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-admin-base-config + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +data: + db.yaml: | + database: + connMaxLifeTime: 120s + dbname: flyteadmin + host: '' + maxIdleConnections: 10 + maxOpenConnections: 80 + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + domain.yaml: | + domains: + - id: development + name: development + - id: staging + name: staging + - id: production + name: production + otel.yaml: | + otel: + file: + filename: /tmp/trace.txt + jaeger: + endpoint: http://localhost:14268/api/traces + otlpgrpc: + endpoint: http://localhost:4317 + otlphttp: + endpoint: http://localhost:4318/v1/traces + sampler: + parentSampler: always + type: noop + server.yaml: | + admin: + endpoint: dns:/// + insecure: false + auth: + appAuth: + identityTypeClaimsForApps: + idtyp: + - app + thirdPartyConfig: + flyteClient: + clientId: flytectl + redirectUri: 
http://localhost:53593/callback + scopes: + - offline + - all + authorizedUris: + - https://localhost:30081 + - http://flyteadmin:80 + - http://flyteadmin.flyte.svc.cluster.local:80 + userAuth: + openId: + baseUrl: https://accounts.google.com + clientId: 657465813211-6eog7ek7li5k7i7fvgv2921075063hpe.apps.googleusercontent.com + scopes: + - profile + - openid + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cloudEvents: + enable: false + connection: + environment: staging + region: '' + rootTenantURLPattern: dns:/// + flyteadmin: + eventVersion: 2 + metadataStoragePrefix: + - metadata + - admin + metricsKeys: + - phase + metricsScope: 'flyte:' + profilerPort: 10254 + roleNameKey: iam.amazonaws.com/role + useOffloadedInputs: true + useOffloadedWorkflowClosure: true + otel: + type: noop + private: + app: + cacheProviderConfig: + kind: bypass + populateUserFields: false + server: + grpc: + port: 8089 + httpPort: 8088 + security: + allowCors: true + allowedHeaders: + - Content-Type + - flyte-authorization + allowedOrigins: + - '*' + secure: false + useAuth: false + sharedService: + connectPort: 8089 + httpPort: 8088 + port: 8089 + selfServeConfig: + legacyHosts: + - '' + union: + internalConnectionConfig: + enabled: true + urlPattern: '_SERVICE_.union.svc.cluster.local:80' + remoteData.yaml: | + remoteData: + region: us-east-1 + scheme: local + signedUrls: + durationMinutes: 3 + storage.yaml: | + storage: + type: s3 + container: "" + connection: + auth-type: iam + region: + enable-multicontainer: false + limits: + maxDownloadMBs: 10 + cache: + max_size_mbs: 1024 + target_gc_percent: 70 + task_resource_defaults.yaml: | + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 2 + gpu: 1 + memory: 1Gi +--- +# Source: 
controlplane/charts/flyte/templates/console/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-console-config + namespace: union + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm +data: + BASE_URL: /console + CONFIG_DIR: /etc/flyte/config +--- +# Source: controlplane/templates/cacheservice/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: cacheservice-config + namespace: union + labels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/managed-by: Helm +data: + db.yaml: | + database: + connMaxLifeTime: 120s + dbname: cacheservice + host: '' + maxIdleConnections: 10 + maxOpenConnections: 20 + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + logger.yaml: | + formatter: + type: json + level: 6 + show-source: true + server.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache-server: + grpcPort: 8089 + grpcServerReflection: true + httpPort: 8080 + cacheservice: + heartbeat-grace-period-multiplier: 3 + max-reservation-heartbeat: 30s + metrics-scope: flyte + profiler-port: 10254 + storage-prefix: cached_outputs + otel: + type: noop + private: + app: + cacheProviderConfig: + kind: bypass + union: + internalConnectionConfig: + enabled: true + urlPattern: '_SERVICE_.union.svc.cluster.local:80' + storage.yaml: | + storage: + type: s3 + container: "" + connection: + auth-type: iam + region: + enable-multicontainer: false + limits: + maxDownloadMBs: 10 + cache: + max_size_mbs: 1024 + target_gc_percent: 70 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: authorizer + 
labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + bootstrap: + adminUsers: [] + domains: + - development + - staging + - production + maxRetries: 30 + organization: "" + projects: [] + retryInterval: 5s + serviceAccounts: [] + externalClient: + forwardHeaders: + - authorization + - flyte-authorization + internalCommunicationConfig: + enabled: false + type: Noop + useExternalIdentity: 'false' + userCloudsClient: + clientID: 'union-authz-client' + clientSecretName: union/client_secret + enableLogging: true + tenantID: 623771e7-ddd6-4575-bedb-7c970ec75b87 + tenantUrl: http://release-name-union-authz.union.svc.cluster.local:8080 + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + connectPort: 8081 + metrics: + scope: 'authorizer:' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: cluster + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: 
"2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + cloudProvider: + provider: Mock + cluster: + cloudflare: + active: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + db: + connectionPool: + maxConnectionLifetime: 1m + maxIdleConnections: 20 + maxOpenConnections: 20 + dbname: '' + host: '' + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + connectPort: 8081 + metrics: + scope: 'cluster:' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: dataproxy + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: 
dns:///fake-host.domain + dataproxy: + clusterSelector: + type: local + secureTunnelTenantURLPattern: http://ingress-nginx-internal.ingress-nginx.svc.cluster.local:80 + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + metrics: + scope: 'dataproxy:' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: executions + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + cloudEventsProcessor: + cloudProvider: Local + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + db: + connectionPool: + maxConnectionLifetime: 1m + maxIdleConnections: 20 + maxOpenConnections: 20 + dbname: '' + host: '' + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + eventsProxy: + recorderType: RunService + executions: + apps: + enrichIdentities: false + publicURLPattern: https://%s.apps. 
+ llm: + enabled: false + task: + enabled: true + enrichIdentities: false + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + metrics: + scope: 'executions:' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 + workspace: + enable: false +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: queue + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + queue: + db: + hosts: + - 'scylla-client.union.svc.cluster.local' + threadCount: 64 + type: cql + eventer: + recordActionThreadCount: 16 + type: runservice + updateActionStatusThreadCount: 16 + sharedService: + metrics: + scope: 'queue:' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + 
type: ClientSecret + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: run-scheduler + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + cache: + identity: + enabled: false + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + db: + connectionPool: + maxConnectionLifetime: 1m + maxIdleConnections: 20 + maxOpenConnections: 20 + dbname: '' + host: '' + passwordPath: /etc/db/pass.txt + port: 5432 + username: '' + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + metrics: + scope: 'run-scheduler:' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 +--- +# Source: controlplane/templates/configmap.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: usage + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + authorizer: + authorizerClient: + forwardHeaders: + - 
authorization + - flyte-authorization + - x-user-token + grpcConfig: + host: dns:///authorizer.union.svc.cluster.local:80 + insecure: true + type: Authorizer + useExternalIdentity: 'false' + billing: + enable: false + cache: + identity: + enabled: false + cloudProvider: + provider: Mock + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain + logger: + formatter: + type: json + level: 6 + show-source: true + otel: + type: noop + sharedService: + connectPort: 8081 + metrics: + scope: 'usage:' + selfServeConfig: + legacyHosts: + - '' + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + enable: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' + type: ClientSecret + internalConnectionConfig: + enabled: true + urlPattern: _SERVICE_.union.svc.cluster.local:80 + usage: + taskMetrics: + agentQuery: + mappings: + dgx_job: + queries: + EXECUTION_METRIC_ALLOCATED_CPU_AVG: CPU_ALLOCATION:MEAN + EXECUTION_METRIC_ALLOCATED_MEMORY_BYTES_AVG: MEM_ALLOCATION:MEAN + EXECUTION_METRIC_CPU_UTILIZATION: CPU_UTILIZATION:MEAN + EXECUTION_METRIC_GPU_UTILIZATION: GPU_UTILIZATION:MEAN + EXECUTION_METRIC_MEMORY_UTILIZATION: MEM_UTILIZATION:MEAN + metricDelayToleranceDuration: 0s + promQuery: + queries: + EXECUTION_METRIC_ALLOCATED_CPU_AVG: | + max by (namespace, pod) ( + ( + sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}[5m])) > + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"}) + ) + or + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"}) + ) * + on (namespace, pod) group_left max by (namespace, pod) 
(kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_ALLOCATED_MEMORY_BYTES_AVG: | + max by (namespace, pod) ( + ( + sum by (namespace, pod) (container_memory_working_set_bytes{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}) > + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"}) + ) + or + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"}) + ) * + on (namespace, pod) group_left max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_APP_REPLICA_COUNT: | + sum (kube_pod_status_phase{phase=~"Running|Pending", namespace="{{.Namespace}}", pod=~"{{.AppName}}.*"} == 1) or vector(0) + EXECUTION_METRIC_APP_REQUESTS: | + sum(rate(( + envoy_cluster_upstream_rq_xx{ + job="serving-envoy", + project=~"{{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, envoy_response_code_class) + EXECUTION_METRIC_APP_RESPONSE_TIME_P50: | + histogram_quantile(0.5, sum(rate(( + envoy_cluster_upstream_rq_time_bucket{ + job="serving-envoy", + project=~"{{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, le)) + EXECUTION_METRIC_APP_RESPONSE_TIME_P90: | + histogram_quantile(0.90, sum(rate(( + envoy_cluster_upstream_rq_time_bucket{ + job="serving-envoy", + project=~"{{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, le)) + EXECUTION_METRIC_APP_RESPONSE_TIME_P95: | + histogram_quantile(0.95, sum(rate(( + envoy_cluster_upstream_rq_time_bucket{ + job="serving-envoy", + project=~"{{.Project}}", + domain=~"{{.Domain}}", + name=~"{{.AppName}}", + name!=""} + )[5m:])) by (project, domain, name, le)) +
EXECUTION_METRIC_CPU_UTILIZATION: | + (sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}[5m])) / + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"})) * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_GPU_FRAME_BUFFER_UTILIZATION: | + (sum by (namespace, pod, gpu) (DCGM_FI_DEV_FB_USED{namespace="{{.Namespace}}",pod=~"{{.PodName}}"}) / + sum by (namespace, pod, gpu) (DCGM_FI_DEV_FB_USED{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} + DCGM_FI_DEV_FB_FREE{namespace="{{.Namespace}}",pod=~"{{.PodName}}"})) * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_GPU_MEMORY_UTILIZATION: | + sum by (gpu) (DCGM_FI_DEV_MEM_COPY_UTIL{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) / 100.0 + EXECUTION_METRIC_GPU_SM_ACTIVE: | + sum by (gpu) (DCGM_FI_PROF_SM_ACTIVE{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_GPU_SM_OCCUPANCY: | + sum by (gpu) (DCGM_FI_PROF_SM_OCCUPANCY{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_GPU_UTILIZATION: | + sum by (gpu) (DCGM_FI_DEV_GPU_UTIL{namespace="{{.Namespace}}",pod=~"{{.PodName}}"} * + on (namespace, pod) group_left() 
max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) / 100.0 + EXECUTION_METRIC_LIMIT_CPU: | + sum by (namespace, pod) (kube_pod_container_resource_limits{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_LIMIT_MEMORY_BYTES: | + sum by (namespace, pod) (kube_pod_container_resource_limits{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_MEMORY_UTILIZATION: | + (sum by (namespace, pod) (container_memory_working_set_bytes{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}) / + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"})) * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1) + EXECUTION_METRIC_REQUEST_CPU: | + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="cpu"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_REQUEST_MEMORY_BYTES: | + sum by (namespace, pod) (kube_pod_container_resource_requests{namespace="{{.Namespace}}",pod=~"{{.PodName}}",resource="memory"} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_USED_CPU_AVG: | + sum by (namespace, pod) 
(irate(container_cpu_usage_seconds_total{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""}[5m]) * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + EXECUTION_METRIC_USED_MEMORY_BYTES_AVG: | + sum by (namespace, pod) (container_memory_working_set_bytes{namespace="{{.Namespace}}",pod=~"{{.PodName}}",image!=""} * + on (namespace, pod) group_left() max by (namespace, pod) (kube_pod_status_phase{namespace="{{.Namespace}}",pod=~"{{.PodName}}",phase=~"Pending|Running"} == 1)) + workers: 10 +--- +# Source: controlplane/templates/monitoring/dashboard-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-dashboard-union-controlplane-overview + namespace: union + labels: + grafana_dashboard: "1" + app.kubernetes.io/managed-by: Helm +data: + union-controlplane-overview.json: |- + { + "annotations": { + "list": [] + }, + "description": "Union Controlplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": 
"avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Connect Error Rate", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval])) / sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Error %", + "refId": "A" + } + ], + "description": "Fraction of Connect RPC responses with non-OK/non-Canceled codes across all CP services." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "none" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Connect Request Rate by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ service }}", + "refId": "A" + } + ], + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Connect Errors by Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", + "legendFormat": "{{ code }}", + "refId": "A" + } + ], + "description": "Connect error responses by gRPC status code (Internal, Unavailable, etc.)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum({__name__=~\"(authorizer|cluster|dataproxy|executions|queue|usage):handler_panic\", namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics across all CP services. Any non-zero value indicates a service caught a panic during request handling."
+ }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. <0 = budget exhausted. 
Requires monitoring.slos.enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Ingress Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:ingress_success_rate or (1 - sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"5..\"}[5m])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[5m])))", + "refId": "A" + } + ], + "description": "Ingress success rate (non-5xx). Customer-facing SLO metric. Falls back to raw metric if SLO recording rules are not enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Ingress Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:ingress_latency_p99 or histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])))", + "refId": "A" + } + ], + "description": "Ingress p99 latency. Falls back to raw metric if SLO recording rules are not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "Service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:cp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 100, + "title": "Ingress (nginx)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 101, + "title": "Request Rate by Path", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (host, path) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ host }}{{ path }}", + "refId": "A" + } + ], + "description": "Ingress request rate broken down by host and URL path." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 102, + "title": "Error Rate by Status Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (status) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"[45]..\"}[$__rate_interval]))", + "legendFormat": "{{ status }}", + "refId": "A" + } + ], + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 103, + "title": "Latency p50 / p95 / p99", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 104, + "title": "Active Connections", + "type": "timeseries", + "targets": [ + { + "expr": "sum(nginx_ingress_controller_nginx_process_connections{namespace=\"$namespace\"})", + "legendFormat": "Active", + "refId": "A" + } + ], + "description": "Current number of active client connections to ingress-nginx." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 200, + "title": "Connect / gRPC", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 201, + "title": "Connect Request Rate by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ service }}", + "refId": "A" + } + ], + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 202, + "title": "Connect Errors by Service & Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service, code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", + "legendFormat": "{{ service }} {{ code }}", + "refId": "A" + } + ], + "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 203, + "title": "gRPC Server Request Rate (CacheService)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_server_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 204, + "title": "gRPC Server Errors (CacheService)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_method, grpc_code) (rate(grpc_server_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "CacheService gRPC errors by method and code." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 300, + "title": "FlyteAdmin (V1 + V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 301, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:admin:execution_manager:active_executions{namespace=\"$namespace\"}", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "flyte:admin:node_execution_manager:active_node_executions{namespace=\"$namespace\"}", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "flyte:admin:task_execution_manager:active_executions{namespace=\"$namespace\"}", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 302, + "title": "Execution Create / Event Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:admin:execution_manager:executions_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Executions created", + "refId": "A" + }, + { + "expr": "rate(flyte:admin:execution_manager:execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Workflow events", + "refId": "B" + }, + { + "expr": "rate(flyte:admin:node_execution_manager:node_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Node events", + "refId": "C" + }, + { + "expr": "rate(flyte:admin:task_execution_manager:task_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Task events", + "refId": "D" + } + ], + "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 303, + "title": "Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:admin:execution_manager:propeller_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller failures", + "refId": "A" + }, + { + "expr": "rate(flyte:admin:execution_manager:transformer_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Transformer errors", + "refId": "B" + }, + { + "expr": "rate(flyte:admin:execution_manager:publish_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Publish errors", + "refId": "C" + }, + { + "expr": "rate(flyte:admin:execution_manager:execution_termination_failure{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Termination failures", + "refId": "D" + } + ], + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 304, + "title": "Endpoint Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:admin:create_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "CreateExecution", + "refId": "A" + }, + { + "expr": "flyte:admin:create_execution_event:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "CreateExecutionEvent", + "refId": "B" + }, + { + "expr": "flyte:admin:get_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "GetExecution", + "refId": "C" + }, + { + "expr": "flyte:admin:list_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "ListExecution", + "refId": "D" + } + ], + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 305, + "title": "Auth Middleware Decisions", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:middleware:authorization:authz_approved{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Approved", + "refId": "A" + }, + { + "expr": "rate(flyte:middleware:authorization:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Denied", + "refId": "B" + } + ], + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "Executions (V1 + V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 401, + "title": "Execution Create / Ack Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Create", + "refId": "A" + }, + { + "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Ack", + "refId": "B" + } + ], + "description": "Rate of execution operation creates and acknowledgements. Create = new execution request, Ack = DP confirmed receipt." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 402, + "title": "Execution Create / Ack Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Create p95", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Ack p95", + "refId": "B" + } + ], + "description": "Time to prepare create/ack execution requests at p95." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 403, + "title": "Assignment Duration (p50 / p90)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p90", + "refId": "B" + } + ], + "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 404, + "title": "Workqueue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Send ops", + "refId": "A" + }, + { + "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Claims", + "refId": "B" + }, + { + "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Send failures", + "refId": "C" + }, + { + "expr": "rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Claim failures", + "refId": "D" + } + ], + "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 405, + "title": "DB Operation Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (op) (rate(label_replace({__name__=~\"executions:database:postgres:repositories:execution_ops:.*_count\", namespace=\"$namespace\"}, \"op\", \"$1\", \"__name__\", \"executions:database:postgres:repositories:execution_ops:(.*)_count\")[$__rate_interval:]))", + "legendFormat": "{{ op }}", + "refId": "A" + } + ], + "description": "Execution operations DB call rate by operation: create, ack, claim, unclaim, get, update." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 406, + "title": "DB Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "gorm_error", + "refId": "A" + }, + { + "expr": "rate(executions:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "postgres_error", + "refId": "B" + }, + { + "expr": "rate(executions:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "not_found", + "refId": "C" + } + ], + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 407, + "title": "Cluster Cache Hit/Miss", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:executions:list_clusters:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cluster hits", + "refId": "A" + }, + { + "expr": "rate(executions:executions:list_clusters:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cluster miss", + "refId": "B" + }, + { + "expr": "rate(executions:executions:list_nodepools:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Nodepool hits", + "refId": "C" + }, + { + "expr": "rate(executions:executions:list_nodepools:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Nodepool miss", + "refId": "D" + } + ], + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 408, + "title": "Pending Assignments", + "type": "timeseries", + "targets": [ + { + "expr": "executions:app:leaser:pending_assignment_unlabeled{namespace=\"$namespace\"}", + "legendFormat": "Pending", + "refId": "A" + } + ], + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 36 + }, + "id": 409, + "title": "First Ack Latency (V2 SLI)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 36 + }, + "id": 410, + "title": "V2 Run Dispatch", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:run:runs_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs sent", + "refId": "A" + }, + { + "expr": "rate(executions:run:actions_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Actions sent", + "refId": "B" + }, + { + "expr": "rate(executions:run:enqueue_action_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Enqueue failures", + "refId": "C" + } + ], + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 36 + }, + "id": 411, + "title": "V2 Run Notifier", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:run_notifier:notifications_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Notifications/s", + "refId": "A" + }, + { + "expr": "executions:run_notifier:subscribers{namespace=\"$namespace\"}", + "legendFormat": "Subscribers", + "refId": "B" + }, + { + "expr": "rate(executions:run:logs:tail_logs_bytes_read{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Log bytes/s", + "refId": "C" + } + ], + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log 
bytes streamed." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 500, + "title": "Queue / Run-Scheduler (V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 13 + }, + "id": 501, + "title": "Metadata Store Counts", + "type": "timeseries", + "targets": [ + { + "expr": "queue:metadata_store:total_run_count{namespace=\"$namespace\"}", + "legendFormat": "Total runs", + "refId": "A" + }, + { + "expr": "queue:metadata_store:total_action_count{namespace=\"$namespace\"}", + "legendFormat": "Total actions", + "refId": "B" + }, + { + "expr": "queue:metadata_store:scheduled_run_count{namespace=\"$namespace\"}", + "legendFormat": "Scheduled runs", + "refId": "C" + }, + { + "expr": "queue:metadata_store:scheduled_action_count{namespace=\"$namespace\"}", + "legendFormat": "Scheduled actions", + "refId": "D" + } + ], + "description": "Total and scheduled run/action counts in the queue. Shows system load." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 13 + }, + "id": 502, + "title": "Scheduler / Runner / Aborter Throughput", + "type": "timeseries", + "targets": [ + { + "expr": "rate(queue:scheduler:enqueued_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Enqueued", + "refId": "A" + }, + { + "expr": "rate(queue:runner:completed_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Completed", + "refId": "B" + }, + { + "expr": "rate(queue:aborter:aborted_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Aborted", + "refId": "C" + } + ], + "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 13 + }, + "id": 503, + "title": "Queue Lengths", + "type": "timeseries", + "targets": [ + { + "expr": "queue:scheduler:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Scheduler input", + "refId": "A" + }, + { + "expr": "queue:runner:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Runner input", + "refId": "B" + }, + { + "expr": "queue:aborter:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Aborter input", + "refId": "C" + }, + { + "expr": "queue:dispatcher:chain_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Dispatcher chain", + "refId": "D" + }, + { + "expr": "queue:db:queue_length{namespace=\"$namespace\"}", + "legendFormat": "DB queue", + "refId": "E" + } + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 21 + }, + "id": 504, + "title": "Dispatcher Operation Duration (p99)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (type, le) (rate(queue:dispatcher:operation_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 21 + }, + "id": 505, + "title": "State Get/Put Duration (p99)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:get_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Get p99", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:put_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Put p99", + "refId": "B" + } + ], + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 21 + }, + "id": 506, + "title": "State Cache & Eventer", + "type": "timeseries", + "targets": [ + { + "expr": "queue:state:active_states{namespace=\"$namespace\"}", + "legendFormat": "Active states", + "refId": "A" + }, + { + "expr": "queue:state:terminal_states{namespace=\"$namespace\"}", + "legendFormat": "Terminal states", + "refId": "B" + }, + { + "expr": "rate(queue:eventer:record_action_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Eventer errors", + "refId": "C" + } + ], + "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 29 + }, + "id": 507, + "title": "Worker Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "queue:scheduler:worker_capacity{namespace=\"$namespace\"}", + "legendFormat": "{{ worker_name }}", + "refId": "A" + } + ], + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 29 + }, + "id": 508, + "title": "Dispatcher Failures by Type", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (type) (rate(queue:dispatcher:operation_failures{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 29 + }, + "id": 509, + "title": "DB & Client Thread Pool", + "type": "timeseries", + "targets": [ + { + "expr": "queue:db:free_threads{namespace=\"$namespace\"}", + "legendFormat": "DB free threads", + "refId": "A" + }, + { + "expr": "queue:queue_client:free_threads{namespace=\"$namespace\"}", + "legendFormat": "Queue client free", + "refId": "B" + }, + { + "expr": "queue:state_client:free_threads{namespace=\"$namespace\"}", + "legendFormat": "State client free", + "refId": "C" + } + ], + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 600, + "title": "Cluster Service (V1 + V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 14 + }, + "id": 601, + "title": "UpdateStatus / Heartbeat Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:svc:update_status:updates_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "UpdateStatus", + "refId": "A" + }, + { + "expr": "rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat", + "refId": "B" + } + ], + "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 14 + }, + "id": 602, + "title": "Cluster API Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "UpdateStatus p95", + "refId": "A" + }, + { + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "Heartbeat p95", + "refId": "B" + } + ], + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 14 + }, + "id": 603, + "title": "Operator / Propeller Restarts (from DP)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:svc:update_status:operator_restarts{namespace=\"$namespace\"}", + "legendFormat": "Operator restarts", + "refId": "A" + }, + { + "expr": "cluster:svc:update_status:propeller_restarts{namespace=\"$namespace\"}", + "legendFormat": "Propeller restarts", + "refId": "B" + } + ], + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 604, + "title": "DB Errors by Type", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "gorm_error", + "refId": "A" + }, + { + "expr": "rate(cluster:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "postgres_error", + "refId": "B" + }, + { + "expr": "rate(cluster:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "not_found", + "refId": "C" + } + ], + "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "Healthy", + "color": "green" + }, + "1": { + "text": "Unhealthy", + "color": "red" + } + } + } + ] + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 605, + "title": "Cluster Health Status", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:cluster_sync:health:unhealthy{namespace=\"$namespace\", subsystem=\"\"}", + "legendFormat": "{{ org }}/{{ cluster_name }}", + "refId": "A" + } + ], + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 606, + "title": "Last Heartbeat Age (stale cluster detection)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:cluster_sync:health:last_update_age{namespace=\"$namespace\"}", + "legendFormat": "{{ org }}/{{ cluster_name }}", + "refId": "A" + } + ], + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 607, + "title": "Managed Cluster Cache", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:managed_cluster_client_cache:get:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cache hits", + "refId": "A" + }, + { + "expr": "rate(cluster:managed_cluster_client_cache:get:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cache miss", + "refId": "B" + } + ], + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." 
+ } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 900, + "title": "CacheService (V1 + V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 901, + "title": "Cache Hit / Miss Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Hits", + "refId": "A" + }, + { + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Misses", + "refId": "B" + }, + { + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Get failures", + "refId": "C" + } + ], + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 902, + "title": "Reservation Contention & Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Contention", + "refId": "A" + }, + { + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reservation acquired", + "refId": "B" + }, + { + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reservation released", + "refId": "C" + } + ], + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 750, + "title": "Authorizer (V1 + V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 15 + }, + "id": 751, + "title": "Allow / Deny Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Allowed", + "refId": "A" + }, + { + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Denied", + "refId": "B" + } + ], + "description": "Authorization decision rate. Allow/deny ratio indicates auth health. High deny rate may signal misconfigured policies. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 15 + }, + "id": 752, + "title": "Authorize Latency", + "type": "timeseries", + "targets": [ + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "End-to-end Authorize() latency including identity resolution and backend authorization check. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 15 + }, + "id": 753, + "title": "Deny Rate (%)", + "type": "timeseries", + "targets": [ + { + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Deny %", + "refId": "A" + } + ], + "description": "Percentage of authorization decisions that denied access. Spikes indicate policy changes or auth issues. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "Noop": { "text": "Noop", "index": 0 }, + "noop": { "text": "Noop", "index": 1 }, + "UserClouds": { "text": "UserClouds", "index": 2 }, + "userclouds": { "text": "UserClouds", "index": 3 }, + "External": { "text": "External", "index": 4 }, + "external": { "text": "External", "index": 5 }, + "Authorizer": { "text": "Authorizer", "index": 6 }, + "authorizer": { "text": "Authorizer", "index": 7 } + } + } + ] + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 23 + }, + "id": 760, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^type$/" + }, + "textMode": "value" + }, + "title": "Authorizer Mode", + "type": "stat", + "targets": [ + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Currently active authorizer backend type (Noop, UserClouds, External, Authorizer)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 4, + "y": 23 + }, + "id": 761, + "title": "External Backend Latency", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Latency of calls to the external authorization backend (p50/p95/p99)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 23 + }, + "id": 762, + "title": "External Errors by gRPC Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_code) (rate(authorizer:authorizer:cloudauthorizer:connect:external:errors{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_code }}", + "refId": "A" + } + ], + "description": "Error rate from the external authorization backend, broken down by gRPC status code." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 23 + }, + "id": 763, + "title": "Fail-Open Activations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Fail-Open", + "refId": "A" + } + ], + "description": "Rate of fail-open activations. Non-zero means the external backend is unreachable and requests are being allowed without authorization." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 764, + "title": "Decisions by Action", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "allowed: {{ action }}", + "refId": "A" + }, + { + "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "denied: {{ action }}", + "refId": "B" + } + ], + "description": "Authorization decisions broken down by action (e.g. read, write, execute). Stacked to show total volume." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 765, + "title": "Error Attribution", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ error_source }}", + "refId": "A" + } + ], + "description": "Authorization errors attributed by source (e.g. identity resolution, backend, policy evaluation)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 700, + "title": "Data Proxy", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 701, + "title": "Cache Hit/Miss Rates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(dataproxy:domains:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Domain hits", + "refId": "A" + }, + { + "expr": "rate(dataproxy:domains:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Domain miss", + "refId": "B" + }, + { + "expr": "rate(dataproxy:clusterpoolcache:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "ClusterPool hits", + "refId": "C" + }, + { + "expr": "rate(dataproxy:clusterpoolcache:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "ClusterPool miss", + "refId": "D" + } + ], + 
"description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 702, + "title": "Image Read Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "legendFormat": "Success p95", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:failure_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "legendFormat": "Failure p95", + "refId": "B" + } + ], + "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 703, + "title": "Secret Proxy Errors by Cluster", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (cluster, operation) (rate(dataproxy:secrets_service:cluster_errors{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ cluster }} {{ operation }}", + "refId": "A" + } + ], + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 800, + "title": "Usage Service", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 801, + "title": "Billable Usage Reports", + "type": "timeseries", + "targets": [ + { + "expr": "rate(usage:svc:report_billable_usage{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reports/s", + "refId": "A" + } + ], + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 802, + "title": "Message Pipeline", + "type": "timeseries", + "targets": [ + { + "expr": "rate(usage:messages:messages_received{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Received", + "refId": "A" + }, + { + "expr": "rate(usage:messages:messages_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sent", + "refId": "B" + }, + { + "expr": "rate(usage:messages:messages_dropped{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Dropped", + "refId": "C" + }, + { + "expr": "rate(usage:messages:messages_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "D" + } + ], + 
"description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 803, + "title": "Messages by Type (success)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (message_type) (rate(usage:messages:messages_processed{namespace=\"$namespace\", outcome=\"success\"}[$__rate_interval]))", + "legendFormat": "{{ message_type }}", + "refId": "A" + } + ], + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 804, + "title": "Message Processing Latency", + "type": "timeseries", + "targets": [ + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Time to process individual queue messages. 
Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "controlplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "union", + "value": "union" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "union", + "value": "union" + } + ], + "query": "union", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Controlplane Overview", + "uid": "union-cp-overview", + "version": 2 + } +--- +# Source: controlplane/templates/scylla/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: scylladb +provisioner: ebs.csi.eks.amazonaws.com +volumeBindingMode: WaitForFirstConsumer +parameters: + fsType: ext4 + type: gp2 +reclaimPolicy: Delete +allowVolumeExpansion: true +--- +# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + 
creationTimestamp: null + name: release-name-envoy-gateway-envoy-gateway-role +rules: +- apiGroups: + - "" + resources: + - nodes + - namespaces + verbs: + - get + - list + - watch +- apiGroups: + - gateway.networking.k8s.io + resources: + - gatewayclasses + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - gateway.networking.k8s.io + resources: + - gatewayclasses/status + verbs: + - update +- apiGroups: + - multicluster.x-k8s.io + resources: + - serviceimports + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + - secrets + - services + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - deployments + - daemonsets + verbs: + - get + - list + - watch +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - list + - watch +- apiGroups: + - gateway.envoyproxy.io + resources: + - envoyproxies + - envoypatchpolicies + - clienttrafficpolicies + - backendtrafficpolicies + - securitypolicies + - envoyextensionpolicies + - backends + - httproutefilters + verbs: + - get + - list + - watch +- apiGroups: + - gateway.envoyproxy.io + resources: + - envoypatchpolicies/status + - clienttrafficpolicies/status + - backendtrafficpolicies/status + - securitypolicies/status + - envoyextensionpolicies/status + - backends/status + verbs: + - update +- apiGroups: + - gateway.networking.k8s.io + resources: + - gateways + - grpcroutes + - httproutes + - referencegrants + - tcproutes + - tlsroutes + - udproutes + - backendtlspolicies + verbs: + - get + - list + - watch +- apiGroups: + - gateway.networking.k8s.io + resources: + - gateways/status + - grpcroutes/status + - httproutes/status + - tcproutes/status + - tlsroutes/status + - udproutes/status + - backendtlspolicies/status + verbs: + - update +- apiGroups: + - "" + resources: + - pods + - pods/binding + verbs: + - get + - list + - patch + - update + - watch +--- +# Source: controlplane/charts/flyte/templates/admin/rbac.yaml 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-flyteadmin + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: + - "" + - flyte.lyft.com + - rbac.authorization.k8s.io + resources: + - configmaps + - flyteworkflows + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - spark-role + - limitranges + verbs: + - '*' +--- +# Source: controlplane/charts/scylla-operator/templates/edit_clusterrole.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: scyllacluster-edit + labels: + rbac.authorization.k8s.io/aggregate-to-admin: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" +rules: +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters + - scylladbmonitorings + - scylladbdatacenters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks + verbs: + - create + - patch + - update + - delete + - deletecollection +--- +# Source: controlplane/charts/scylla-operator/templates/operator.clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:operator +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/operator.clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:aggregate-to-operator + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator: "true" +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update +- apiGroups: + - "" + resources: + - nodes + - endpoints + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - persistentvolumeclaims + 
verbs: + - delete + - get + - list + - patch + - update + - watch + - patch +- apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create +- apiGroups: + - "" + resources: + - configmaps + - endpoints + - namespaces + - secrets + - serviceaccounts + - services + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + verbs: + - create + - get + - list + - watch + - update + - patch + - delete +- apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - update +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters + - scylladbmonitorings + - scylladbdatacenters + - remotekubernetesclusters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters/status + - scylladbmonitorings/status + - scylladbdatacenters/status + - remotekubernetesclusters/status + - scylladbclusters/status + - scylladbmanagerclusterregistrations/status + - scylladbmanagertasks/status + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - nodeconfigs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + - roles + - rolebindings + verbs: + - create 
+ - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - nodeconfigs/status + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaoperatorconfigs + - scyllaoperatorconfigs/status + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - monitoring.coreos.com + resources: + - prometheuses + - prometheusrules + - servicemonitors + verbs: + - get + - list + - watch + - create + - patch + - update + - delete +- apiGroups: + - "" + resources: + - configmaps/finalizers + - secrets/finalizers + - pods/finalizers + verbs: + - update +- apiGroups: + - apps + resources: + - daemonsets/finalizers + verbs: + - update +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters/finalizers + - scylladbdatacenters/finalizers + - scylladbmonitorings/finalizers + - scylladbmanagerclusterregistrations/finalizers + - scylladbmanagertasks/finalizers + verbs: + - update +- apiGroups: + - policy + resources: + - poddisruptionbudgets/finalizers + verbs: + - update +- apiGroups: + - scylla.scylladb.com + resources: + - nodeconfigs/finalizers + verbs: + - update +- apiGroups: + - "" + resources: + - configmaps/finalizers + - secrets/finalizers + - pods/finalizers + verbs: + - update +- apiGroups: + - apps + resources: + - daemonsets/finalizers + verbs: + - update +- apiGroups: + - policy + resources: + - poddisruptionbudgets/finalizers + verbs: + - update +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +--- +# Source: 
controlplane/charts/scylla-operator/templates/operator.clusterrole_def_openshift.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:aggregate-to-operator-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/operator_remote.clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:operator-remote +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator-remote: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/operator_remote.clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:controller:aggregate-to-operator-remote + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-operator-remote: "true" +rules: +- apiGroups: + - scylla.scylladb.com + resources: + - scylladbdatacenters + - remoteowners + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scylla.scylladb.com + resources: + - scylladbdatacenters/status + - remoteowners/status + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - endpoints + - namespaces + - services + - secrets + - configmaps + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +--- +# Source: controlplane/charts/scylla-operator/templates/scyllacluster_member_clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole 
+metadata: + name: scyllacluster-member +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylla-member: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/scyllacluster_member_clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scyllacluster-member + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-member: "true" +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - secrets + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - "apps" + resources: + - statefulsets + verbs: + - get + - list + - patch + - watch +- apiGroups: + - "" + resources: + - configmaps/finalizers + - secrets/finalizers + verbs: + - update +--- +# Source: controlplane/charts/scylla-operator/templates/scyllacluster_member_clusterrole_def_openshift.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scyllacluster-member-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylla-member: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_grafana_clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:monitoring:grafana +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-grafana: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_grafana_clusterrole_def_openshift.yaml +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scylladb-monitoring-grafana-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-grafana: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_prometheus_clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:monitoring:prometheus +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-prometheus: "true" +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_prometheus_clusterrole_def.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scylladb-monitoring-prometheus + labels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-prometheus: "true" +rules: +- apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get +--- +# Source: controlplane/charts/scylla-operator/templates/scylladbmonitoring_prometheus_clusterrole_def_openshift.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scylladb:aggregate-to-scylladb-monitoring-prometheus-openshift + labels: + rbac.operator.scylladb.com/aggregate-to-scylladb-monitoring-prometheus: "true" +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +# Source: controlplane/charts/scylla-operator/templates/view_clusterrole.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: scyllacluster-view + labels: + rbac.authorization.k8s.io/aggregate-to-admin: "true" + 
rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-view: "true" +rules: +- apiGroups: + - scylla.scylladb.com + resources: + - scyllaclusters + - scylladbmonitorings + - scylladbdatacenters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks + verbs: + - get + - list + - watch +--- +# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-envoy-gateway-envoy-gateway-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-envoy-gateway-envoy-gateway-role +subjects: +- kind: ServiceAccount + name: 'envoy-gateway' + namespace: 'union' +--- +# Source: controlplane/charts/flyte/templates/admin/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-flyteadmin-binding + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-flyteadmin +subjects: +- kind: ServiceAccount + name: flyteadmin + namespace: union +--- +# Source: controlplane/charts/scylla-operator/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: scylladb:controller:operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: scylladb:controller:operator +subjects: +- kind: ServiceAccount + name: scylla-operator + namespace: scylla-operator +--- +# Source: controlplane/charts/envoy-gateway/templates/infra-manager-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: release-name-envoy-gateway-infra-manager + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + 
app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: + - "" + resources: + - serviceaccounts + - services + - configmaps + verbs: + - create + - get + - list + - delete + - deletecollection + - patch +- apiGroups: + - apps + resources: + - deployments + - daemonsets + verbs: + - create + - get + - delete + - deletecollection + - patch +- apiGroups: + - autoscaling + - policy + resources: + - horizontalpodautoscalers + - poddisruptionbudgets + verbs: + - create + - get + - list + - delete + - deletecollection + - patch +- apiGroups: + - certificates.k8s.io + resources: + - clustertrustbundles + verbs: + - list + - get + - watch +--- +# Source: controlplane/charts/envoy-gateway/templates/leader-election-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: release-name-envoy-gateway-leader-election-role + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.5 + #app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + - flyte.lyft.com + - rbac.authorization.k8s.io + resources: + - configmaps + - flyteworkflows + - namespaces + - pods + - 
resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - spark-role + verbs: + - '*' +--- +# Source: controlplane/charts/envoy-gateway/templates/infra-manager-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-envoy-gateway-infra-manager + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: 'release-name-envoy-gateway-infra-manager' +subjects: +- kind: ServiceAccount + name: 'envoy-gateway' + namespace: 'union' +--- +# Source: controlplane/charts/envoy-gateway/templates/leader-election-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-envoy-gateway-leader-election-rolebinding + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: 'release-name-envoy-gateway-leader-election-role' +subjects: +- kind: ServiceAccount + name: 'envoy-gateway' + namespace: 'union' +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: flyteadmin-binding + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.5 + #app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: flyteadmin +subjects: + - kind: ServiceAccount + name: flyteadmin + namespace: union +--- +# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-service.yaml +apiVersion: v1 +kind: 
Service +metadata: + name: envoy-gateway + namespace: 'union' + labels: + control-plane: envoy-gateway + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + selector: + control-plane: envoy-gateway + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + ports: + - name: grpc + port: 18000 + targetPort: 18000 + - name: ratelimit + port: 18001 + targetPort: 18001 + - name: wasm + port: 18002 + targetPort: 18002 + - name: metrics + port: 19001 + targetPort: 19001 + - name: webhook + port: 9443 + targetPort: 9443 +--- +# Source: controlplane/charts/flyte/templates/admin/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 8088 + - name: grpc + port: 81 + protocol: TCP + # intentionally set to TCP instead of grpc + targetPort: 8089 + - name: redoc + protocol: TCP + port: 87 + targetPort: 8087 + - name: http-metrics + protocol: TCP + port: 10254 + selector: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/flyte/templates/console/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconsole + namespace: union + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm + annotations: + external-dns.alpha.kubernetes.io/hostname: flyte.example.com + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "600" +spec: + type: ClusterIP + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 8080 + 
selector: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: scylla-operator + name: scylla-operator-webhook + labels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +spec: + ports: + - port: 443 + targetPort: 5000 + name: webhook + selector: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +--- +# Source: controlplane/templates/cacheservice/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: cacheservice + namespace: union + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: http + port: 88 + protocol: TCP + targetPort: http + - name: grpc + port: 89 + protocol: TCP + targetPort: grpc + - name: http-metrics + protocol: TCP + port: 10254 + selector: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/console/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: unionconsole + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: http + protocol: TCP + name: http + - port: 8081 + targetPort: http-metrics + protocol: TCP + name: http-metrics + selector: + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +--- +apiVersion: v1 +kind: Service +metadata: + name: authorizer + labels: + 
platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: connect + - name: grpc-native + port: 8080 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: cluster + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: connect + - name: grpc-native + port: 8080 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: dataproxy + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: 
connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: executions + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: queue + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: run-scheduler + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + 
app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: usage + labels: + platform.union.ai/prometheus-group: "union-services" + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - name: grpc + port: 80 + protocol: TCP + targetPort: connect + - name: grpc-native + port: 8080 + protocol: TCP + targetPort: grpc + - name: connect + port: 83 + protocol: TCP + targetPort: connect + - name: http + port: 81 + protocol: TCP + targetPort: http + - name: debug + port: 82 + protocol: TCP + targetPort: debug + selector: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name +--- +# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: envoy-gateway + namespace: 'union' + labels: + control-plane: envoy-gateway + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + control-plane: envoy-gateway + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + prometheus.io/port: "19001" 
+ prometheus.io/scrape: "true" + labels: + control-plane: envoy-gateway + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + spec: + containers: + - args: + - server + - --config-path=/config/envoy-gateway.yaml + env: + - name: ENVOY_GATEWAY_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + - name: KUBERNETES_CLUSTER_DOMAIN + value: cluster.local + image: docker.io/envoyproxy/gateway:v1.6.4 + imagePullPolicy: IfNotPresent + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: envoy-gateway + ports: + - containerPort: 18000 + name: grpc + - containerPort: 18001 + name: ratelimit + - containerPort: 18002 + name: wasm + - containerPort: 19001 + name: metrics + - name: webhook + containerPort: 9443 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + memory: 1024Mi + requests: + cpu: 100m + memory: 256Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + volumeMounts: + - mountPath: /config + name: envoy-gateway-config + readOnly: true + - mountPath: /certs + name: certs + readOnly: true + imagePullSecrets: [] + serviceAccountName: envoy-gateway + terminationGracePeriodSeconds: 10 + volumes: + - configMap: + defaultMode: 420 + name: envoy-gateway-config + name: envoy-gateway-config + - name: certs + secret: + secretName: envoy-gateway +--- +# Source: controlplane/charts/flyte/templates/admin/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: flyteadmin 
+ app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + configChecksum: "676ab3d9a1d5a3d13b441cc308a756dfab35fc748311e7b336490809728d692" + kubectl.kubernetes.io/default-container: flyteadmin + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + initContainers: + - command: + - flyteadmin + - --config + - /etc/flyte/config/*.yaml + - migrate + - run + image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + imagePullPolicy: "IfNotPresent" + name: run-migrations + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /etc/flyte/config + name: base-config-volume + - command: + - flyteadmin + - --config + - /etc/flyte/config/*.yaml + - migrate + - seed-projects + - union-health-monitoring + - flytesnacks + image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + imagePullPolicy: "IfNotPresent" + name: seed-projects + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /etc/flyte/config + name: base-config-volume + - name: generate-secrets + image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + imagePullPolicy: "IfNotPresent" + command: ["/bin/sh", "-c"] + args: + [ + "flyteadmin --config=/etc/flyte/config/*.yaml secrets init --localPath /etc/scratch/secrets && flyteadmin --config=/etc/flyte/config/*.yaml secrets create --name flyte-admin-secrets --fromPath /etc/scratch/secrets", + ] + securityContext: + 
allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - mountPath: /etc/flyte/config + name: base-config-volume + - mountPath: /etc/scratch + name: scratch + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + containers: + - command: + - flyteadmin + - --config + - /etc/flyte/config/*.yaml + - serve + image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + imagePullPolicy: "IfNotPresent" + name: flyteadmin + ports: + - containerPort: 8088 + - containerPort: 8089 + - containerPort: 10254 + readinessProbe: + httpGet: + path: /healthcheck + port: 8088 + initialDelaySeconds: 15 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthcheck + port: 8088 + initialDelaySeconds: 20 + timeoutSeconds: 1 + periodSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + resources: + limits: + cpu: 2 + ephemeral-storage: 500Mi + memory: 3Gi + requests: + cpu: 50m + ephemeral-storage: 200Mi + memory: 500Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /srv/flyte + name: shared-data + - mountPath: /etc/flyte/config + name: clusters-config-volume + - mountPath: /etc/secrets/ + name: admin-secrets + - mountPath: /etc/secrets/union + name: union-secrets + readOnly: true + serviceAccountName: flyteadmin + volumes: + - name: union-controlplane-secrets + secret: + secretName: union-controlplane-secrets + - emptyDir: {} + name: shared-data + - emptyDir: {} + name: scratch + - projected: + sources: + - configMap: + name: flyte-admin-base-config + name: base-config-volume + - projected: + sources: + - configMap: + name: flyte-admin-base-config + - configMap: + name: flyte-admin-clusters-config + name: clusters-config-volume + - name: admin-secrets + secret: + secretName: flyte-admin-secrets + - name: 
union-secrets + secret: + secretName: '' +--- +# Source: controlplane/charts/flyte/templates/console/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconsole + namespace: union + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "2620ed20cf30d64460b231bbcf13fc096a23b6d373b46e69ab5f2e051f3d3d1" + linkerd.io/inject: disabled + prometheus.io/scrape: "false" + labels: + app.kubernetes.io/name: flyteconsole + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + app.kubernetes.io/managed-by: Helm + spec: + securityContext: + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + runAsUser: 1000 + seLinuxOptions: + type: spc_t + containers: + - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/flyteconsole:" + imagePullPolicy: "IfNotPresent" + name: flyteconsole + envFrom: + - configMapRef: + name: flyte-console-config + ports: + - containerPort: 8080 + env: + - name: ENABLE_GA + value: "true" + - name: GA_TRACKING_ID + value: "" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 250m + ephemeral-storage: 200Mi + memory: 250Mi + requests: + cpu: 10m + ephemeral-storage: 20Mi + memory: 50Mi + volumeMounts: + - mountPath: /srv/flyte + name: shared-data + volumes: + - emptyDir: {} + name: shared-data +--- +# Source: controlplane/charts/scylla-operator/templates/operator.deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: scylla-operator + namespace: scylla-operator + labels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator +spec: + replicas: 2 + strategy: + type: RollingUpdate + selector: + 
matchLabels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator + template: + metadata: + labels: + app.kubernetes.io/name: scylla-operator + app.kubernetes.io/instance: scylla-operator + spec: + serviceAccountName: scylla-operator + containers: + - name: scylla-operator + image: scylladb/scylla-operator:1.18.1 + imagePullPolicy: IfNotPresent + env: + - name: SCYLLA_OPERATOR_IMAGE + value: scylladb/scylla-operator:1.18.1 + args: + - operator + - --loglevel=2 + resources: + requests: + cpu: 100m + memory: 20Mi + terminationGracePeriodSeconds: 10 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: scylla-operator + app.kubernetes.io/name: scylla-operator + topologyKey: kubernetes.io/hostname + weight: 1 +--- +# Source: controlplane/charts/scylla-operator/templates/webhookserver.deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: scylla-operator + name: webhook-server + labels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server +spec: + replicas: 2 + strategy: + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server + template: + metadata: + labels: + app.kubernetes.io/name: webhook-server + app.kubernetes.io/instance: webhook-server + spec: + serviceAccountName: "webhook-server" + containers: + - name: webhook-server + image: scylladb/scylla-operator:1.18.1 + imagePullPolicy: IfNotPresent + args: + - run-webhook-server + - --loglevel=2 + - --tls-cert-file=/tmp/serving-certs/tls.crt + - --tls-private-key-file=/tmp/serving-certs/tls.key + livenessProbe: + httpGet: + path: /readyz + port: 5000 + scheme: HTTPS + readinessProbe: + httpGet: + path: /readyz + port: 5000 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 10 + lifecycle: + preStop: + exec: + command: + - 
/usr/bin/sleep + - 15s + ports: + - containerPort: 5000 + name: webhook-server + protocol: TCP + resources: + requests: + cpu: 10m + memory: 20Mi + volumeMounts: + - mountPath: /tmp/serving-certs + name: cert + readOnly: true + terminationGracePeriodSeconds: 75 + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: scylla-operator-serving-cert + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: webhook-server + app.kubernetes.io/name: webhook-server + topologyKey: kubernetes.io/hostname + weight: 1 +--- +# Source: controlplane/templates/cacheservice/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cacheservice + namespace: union + labels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "9c88958e2c6c93925c335fa455ed035dc6c32e92bead61445ca6463e40c19cc" + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + platform.union.ai/zone: "controlplane" + + app.kubernetes.io/name: cacheservice + app.kubernetes.io/instance: release-name + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/managed-by: Helm + spec: + securityContext: + fsGroup: 1001 + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + initContainers: + - command: + - cacheservice + - --config + - /etc/cacheservice/config/*.yaml + - migrate + - run + image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + imagePullPolicy: "IfNotPresent" + name: run-migrations + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - 
mountPath: /etc/cacheservice/config + name: config-volume + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + containers: + - command: + - cacheservice + - --config + - /etc/cacheservice/config/*.yaml + - serve + image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + imagePullPolicy: "IfNotPresent" + name: cacheservice + ports: + - name: http + containerPort: 8088 + - name: grpc + containerPort: 8089 + - name: http-metrics + containerPort: 10254 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + limits: + cpu: 1 + ephemeral-storage: 200Mi + requests: + cpu: 500m + ephemeral-storage: 200Mi + memory: 200Mi + volumeMounts: + - mountPath: /etc/db + name: union-controlplane-secrets + - mountPath: /etc/cacheservice/config + name: config-volume + - mountPath: /etc/secrets/union + name: union-secrets + readOnly: true + serviceAccountName: cacheservice + volumes: + - name: union-controlplane-secrets + secret: + secretName: union-controlplane-secrets + - emptyDir: {} + name: shared-data + - configMap: + name: cacheservice-config + name: config-volume + - name: union-secrets + secret: + secretName: '' + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: cacheservice + topologyKey: kubernetes.io/hostname +--- +# Source: controlplane/templates/console/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + 
kubectl.kubernetes.io/default-container: unionconsole + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + platform.union.ai/zone: "controlplane" + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: unionconsole + securityContext: + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + runAsUser: 1000 + seLinuxOptions: + type: spc_t + containers: + - name: unionconsole + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/unionconsole:2026.4.5" + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + protocol: TCP + - name: http-metrics + containerPort: 8081 + protocol: TCP + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi +--- +# Source: controlplane/templates/deployment.yaml +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: authorizer + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: authorizer + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: authorizer + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: authorizer + containers: + - name: authorizer + image: 
643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - authorizer + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + - name: connect + containerPort: 8081 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: authorizer + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: cluster + linkerd.io/inject: disabled + prometheus.io/path: /metrics + 
prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: cluster + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: cluster + initContainers: + - name: cluster-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - cloudcluster + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: cluster + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - cloudcluster + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + - name: connect + containerPort: 8081 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: cluster + app.kubernetes.io/instance: release-name + 
topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataproxy + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: dataproxy + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: dataproxy + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: dataproxy + containers: + - name: dataproxy + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - dataproxy + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + 
port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: dataproxy + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executions + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: executions + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: executions + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: executions + initContainers: + - name: executions-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: executions + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + 
containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: executions + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: queue + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: queue + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: queue + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: queue + initContainers: + - 
name: queue-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - queue + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: queue + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - queue + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: queue + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: run-scheduler + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + selector: + 
matchLabels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: run-scheduler + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: run-scheduler + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: run-scheduler + initContainers: + - name: run-scheduler-migrate + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - migrate + - --config + - /etc/config/*.yaml + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + containers: + - name: run-scheduler + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - cloudpropeller + - scheduler + - start + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + 
readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: run-scheduler + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: usage + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: usage + linkerd.io/inject: disabled + prometheus.io/path: /metrics + prometheus.io/port: "10254" + labels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: usage + volumes: + - name: secrets + secret: + secretName: + - name: db-pass + secret: + secretName: + - name: config + configMap: + name: usage + containers: + - name: usage + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + imagePullPolicy: IfNotPresent + args: + - usage + - serve + - --config + - /etc/config/*.yaml + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + - name: connect + containerPort: 8081 + protocol: TCP + volumeMounts: + - name: db-pass + mountPath: /etc/db + - name: secrets + mountPath: /etc/secrets/union + - name: config + mountPath: /etc/config/ + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: "1" + resource: 
limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: "1" + resource: limits.cpu + resources: + limits: + cpu: 3 + memory: 512Mi + requests: + cpu: 500m + memory: 250Mi + livenessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /healthcheck + port: debug + initialDelaySeconds: 3 + periodSeconds: 3 + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: usage + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: controlplane/charts/flyte/templates/admin/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: flyteadmin + namespace: union + labels: + app.kubernetes.io/name: flyteadmin + app.kubernetes.io/instance: release-name + helm.sh/chart: flyte-v1.16.1 + #app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteadmin + minReplicas: 1 + maxReplicas: 10 + metrics: + + - resource: + name: cpu + target: + averageUtilization: 80 + type: Utilization + type: Resource + - resource: + name: memory + target: + averageUtilization: 80 + type: Utilization + type: Resource +--- +# Source: controlplane/templates/console/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: unionconsole + labels: + helm.sh/chart: controlplane-2026.4.5 + app.kubernetes.io/name: unionconsole + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: unionconsole + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization 
+ averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: authorizer +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: authorizer + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: cluster +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: cluster + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: dataproxy +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: dataproxy + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: executions +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: executions + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler 
+metadata: + name: run-scheduler +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: run-scheduler + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: usage +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: usage + minReplicas: 1 + maxReplicas: 1 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-dataproxy + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /data/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /data + pathType: Prefix + backend: + service: + name: dataproxy + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-usage-grpc + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + 
nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
+ client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC + nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me + nginx.ingress.kubernetes.io/use-regex: "true" +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /cloudidl.usage.UsageService(/(?!GetCustomMeasuresNames|GetMeasureGroup|GetMeasureGroups|GetBillableMeasures|GetBillingInfo|ReportBillableUsage|ReportServerlessBillableUsage|CreateCustomer|AttachBillingPlanToCustomer|GetCustomerCredits|EnqueueMetronomeRequest|EnqueueStripeRequest|GetOrgCheckoutSession).*|$) + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: connect +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-usage + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. 
These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service + nginx.ingress.kubernetes.io/use-regex: "true" +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /usage/api/v1(/(?!custom_measures_names|measure_group|measure_groups|billable_measures|billing_info|report_billable_usage|customer_credits|checkout_session).*|$) + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress 
+metadata: + name: controlplane-protected + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
+ client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /api + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /api/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /v1/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /cloudadmin + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /cloudadmin/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /actor + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /actor/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /agent + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /agent/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /dataplane + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /dataplane/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: 
http + - path: /spark-history-server + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /spark-history-server/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /api/v1/dataproxy + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /api/v1/dataproxy/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: http + - path: /app + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /app/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /apps + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /apps/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: http + - path: /cluster + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /cluster/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /clusterpool + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /clusterpool/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /clusterconfig + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /clusterconfig/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /org + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: http + - path: /org/* + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: http + - path: /managed_cluster + pathType: ImplementationSpecific + backend: + service: + name: 
cluster + port: + name: http + - path: /managed_cluster/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: http + - path: /authorizer + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: http + - path: /authorizer/* + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: http + - path: /oauth_app + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /oauth_app/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /users + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /users/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /members + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /members/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /roles + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /roles/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /policies + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /policies/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /identities + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /identities/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: http + - path: /echo + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /echo/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: 
http + - path: /execution + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /execution/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_registry + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_registry/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_instance + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /workspace_instance/* + pathType: ImplementationSpecific + backend: + service: + name: execution + port: + name: http + - path: /usage + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http + - path: /usage/* + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-protected-grpc + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC + nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /cloudidl.execution.ExecutionService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.execution.ExecutionService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.cluster.ClusterService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.cluster.ClusterService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: 
/cloudidl.cluster.ClusterNodepoolService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.cluster.ClusterNodepoolService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.apikey.APIKeyService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.apikey.APIKeyService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.AppsService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.AppsService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.org.OrgService/* + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: grpc + - path: /cloudidl.org.OrgService + pathType: ImplementationSpecific + backend: + service: + name: organizations + port: + name: grpc + - path: /cloudidl.cloudaccounts.CloudAccountsService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: grpc + - path: /cloudidl.cloudaccounts.CloudAccountsService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: grpc + - path: /cloudidl.cluster.ManagedClusterService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.cluster.ManagedClusterService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.identity.UserService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.UserService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - 
path: /cloudidl.identity.MemberService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.MemberService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.RoleService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.RoleService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.PolicyService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.PolicyService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: connect + - path: /cloudidl.identity.SelfServe/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.identity.SelfServe + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.identity.IdentityService/* + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.identity.IdentityService + pathType: ImplementationSpecific + backend: + service: + name: identity + port: + name: grpc + - path: /cloudidl.clusterpool.ClusterPoolService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.clusterpool.ClusterPoolService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.clusterconfig.ClusterConfigService/* + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect + - path: /cloudidl.clusterconfig.ClusterConfigService + pathType: ImplementationSpecific + backend: + service: + name: cluster + port: + name: connect 
+ - path: /cloudidl.authorizer.AuthorizerService/* + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: connect + - path: /cloudidl.authorizer.AuthorizerService + pathType: ImplementationSpecific + backend: + service: + name: authorizer + port: + name: connect + - path: /cloudidl.usage.UsageService/* + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: connect + - path: /cloudidl.usage.UsageService + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: connect + - path: /datacatalog.DataCatalog/* + pathType: ImplementationSpecific + backend: + service: + name: datacatalog + port: + name: grpc + - path: /datacatalog.DataCatalog + pathType: ImplementationSpecific + backend: + service: + name: datacatalog + port: + name: grpc + - path: /flyteidl.cacheservice.CacheService/* + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /flyteidl.cacheservice.CacheService + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /flyteidl.cacheservice.v2.CacheService/* + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /flyteidl.cacheservice.v2.CacheService + pathType: ImplementationSpecific + backend: + service: + name: cacheservice + port: + name: grpc + - path: /cloudidl.actor.ActorEnvironmentService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.actor.ActorEnvironmentService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.agent.AgentService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.agent.AgentService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - 
path: /cloudidl.secret.SecretService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.secret.SecretService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.secret.SecretService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.secret.SecretService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.support.SupportService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.clouddataproxy.CloudDataProxyService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.clouddataproxy.CloudDataProxyService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl.service.DataProxyService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl.service.DataProxyService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.dataproxy.DataProxyService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.dataproxy.DataProxyService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.logs.LogsService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.logs.LogsService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceRegistryService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - 
path: /cloudidl.workspace.WorkspaceRegistryService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceInstanceService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceInstanceService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.RunService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.RunService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TranslatorService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TranslatorService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TaskService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TaskService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TriggerService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TriggerService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.QueueService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + 
name: grpc + - path: /cloudidl.workflow.QueueService/* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.StateService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.StateService/* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + + - path: /flyteidl2.workflow.RunService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.RunService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.TranslatorService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.TranslatorService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.task.TaskService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.task.TaskService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.QueueService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.QueueService/* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.trigger.TriggerService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.trigger.TriggerService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.StateService + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.StateService/* + 
pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.imagebuilder.ImageService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.imagebuilder.ImageService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.imagebuilder.ImageService + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.imagebuilder.ImageService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.app.AppService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.app.AppLogsService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.app.ReplicaService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.AppService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.app.AppLogsService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.ReplicaService/* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-protected-grpc-streaming + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + 
nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
+ client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC + nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /flyteidl2.auth.IdentityService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.auth.IdentityService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.project.ProjectService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.project.ProjectService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AdminService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AdminService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + + - path: /flyteidl.service.WatchService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + + - path: /flyteidl.service.WatchService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /cloudidl.cloudadmin.CloudAdminService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: 
/cloudidl.cloudadmin.CloudAdminService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.IdentityService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.IdentityService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /cloudidl.echo.EchoService/* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.echo.EchoService + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl.service.SignalService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.SignalService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /cloudidl.actor.ActorEnvironmentService/Stream* + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.execution.ExecutionService/GetExecutionOperation + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.RunLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.RunService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService/Record* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.InternalRunService/Update* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.workflow.TaskService/Watch* + pathType: ImplementationSpecific + backend: + service: + 
name: executions + port: + name: grpc + - path: /cloudidl.workflow.LeaseService/Heartbeat + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.QueueService/Heartbeat + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.StateService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.QueueService/StreamLeases + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.workflow.LeaseService/StreamLeases + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + + - path: /flyteidl2.workflow.RunLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.RunService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.task.TaskService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.workflow.QueueService/Heartbeat + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.StateService/Watch* + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /flyteidl2.workflow.QueueService/StreamLeases + pathType: ImplementationSpecific + backend: + service: + name: queue + port: + name: grpc + - path: /cloudidl.logs.LogsService/TailTaskExecutionLogs + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.workspace.WorkspaceInstanceService/WatchWorkspaceInstances + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: 
/cloudidl.app.AppService/Watch + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.app.AppService/Lease + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /cloudidl.app.AppLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /cloudidl.app.ReplicaService/WatchReplicas + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.AppService/Watch + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.app.AppService/Lease + pathType: ImplementationSpecific + backend: + service: + name: executions + port: + name: grpc + - path: /flyteidl2.app.AppLogsService/TailLogs + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc + - path: /flyteidl2.app.ReplicaService/WatchReplicas + pathType: ImplementationSpecific + backend: + service: + name: dataproxy + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. 
These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + # Port 87 in FlyteAdmin maps to the redoc container. + - path: /openapi + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: redoc + - path: /healthcheck + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /healthz + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /me + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + # Port 87 in FlyteAdmin maps to the redoc container. 
+ - path: /openapi/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: redoc + - path: /.well-known + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /.well-known/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /login + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /login/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /logout + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /logout/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /callback + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /callback/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /config + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /config/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /oauth2 + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /oauth2/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /auth + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /auth/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: http + - path: /enqueue_metronome_request/v1 + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http + - path: /enqueue_metronome_request/v1/* + pathType: ImplementationSpecific + backend: + service: + name: usage + 
port: + name: http + - path: /enqueue_stripe_request/v1 + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http + - path: /enqueue_stripe_request/v1/* + pathType: ImplementationSpecific + backend: + service: + name: usage + port: + name: http +--- +# Source: controlplane/templates/flyte-core-app.yaml +# Certain ingress controllers like nginx cannot serve HTTP 1 and GRPC with a single ingress because GRPC can only +# enabled on the ingress object, not on backend services (GRPC annotation is set on the ingress, not on the services). +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-grpc + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
+ - path: /flyteidl.service.HealthService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.HealthService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AuthMetadataService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl.service.AuthMetadataService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.auth.AuthMetadataService + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc + - path: /flyteidl2.auth.AuthMetadataService/* + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-grpc-streaming + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). 
+ # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/backend-protocol: GRPC +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: /flyteidl.service.WatchService/WatchExecutionStatusUpdates + pathType: ImplementationSpecific + backend: + service: + name: flyteadmin + port: + name: grpc +--- +# Source: controlplane/templates/flyte-core-app.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: controlplane-console-protected + namespace: union + annotations: + nginx.ingress.kubernetes.io/app-root: /v2 + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/proxy-body-size: 6m + nginx.ingress.kubernetes.io/proxy-buffer-size: 32k + nginx.ingress.kubernetes.io/proxy-buffers: 4 32k + nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host + nginx.ingress.kubernetes.io/server-snippet: | + client_header_timeout 604800; + client_body_timeout 604800; + # Increasing the default configuration from + # client_header_buffer_size 1k; + # 
large_client_header_buffers 4 8k; + # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason + # about expected header sizs (PE-1101). + # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller + # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 + # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 + # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. + # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. + client_header_buffer_size 16k; + large_client_header_buffers 64 32k; + nginx.ingress.kubernetes.io/service-upstream: "true" + nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://$host/me + nginx.org/websocket-services: dataproxy-service +spec: + ingressClassName: "controlplane" + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret + rules: + - host: fake-host.domain + http: + paths: + - path: / + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + # NOTE: If you change this, you must update the BASE_URL value in flyteconsole.yaml + - path: /console + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /console/* + 
pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /dashboard + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /dashboard/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /resources + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /resources/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /cost + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /cost/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /loading + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /loading/* + pathType: ImplementationSpecific + backend: + service: + name: flyteconsole + port: + name: http + - path: /v2 + pathType: ImplementationSpecific + backend: + service: + name: unionconsole + port: + name: http + - path: /v2/* + pathType: ImplementationSpecific + backend: + service: + name: unionconsole + port: + name: http +--- +# Source: controlplane/charts/scylla-operator/templates/validatingwebhook.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + annotations: + cert-manager.io/inject-ca-from: scylla-operator/scylla-operator-serving-cert + name: scylla-operator +webhooks: +- name: webhook.scylla.scylladb.com + clientConfig: + service: + name: scylla-operator-webhook + namespace: scylla-operator + path: /validate + admissionReviewVersions: + - v1 + sideEffects: None + failurePolicy: Fail + rules: + - apiGroups: + - scylla.scylladb.com + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - scyllaclusters + - apiGroups: + - scylla.scylladb.com + apiVersions: + 
- v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - nodeconfigs + - scyllaoperatorconfigs + - scylladbdatacenters + - scylladbclusters + - scylladbmanagerclusterregistrations + - scylladbmanagertasks +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/templates/secret.yaml +--- +--- +# Source: controlplane/charts/scylla-operator/templates/certificate.yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: scylla-operator-serving-cert + namespace: scylla-operator +spec: + dnsNames: + - scylla-operator-webhook.scylla-operator.svc + issuerRef: + kind: Issuer + name: scylla-operator-selfsigned-issuer + secretName: scylla-operator-serving-cert +--- +# Source: controlplane/charts/scylla-operator/templates/issuer.yaml +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: scylla-operator-selfsigned-issuer + namespace: scylla-operator +spec: + selfSigned: {} +--- +# Source: controlplane/charts/scylla/templates/scyllacluster.yaml +apiVersion: scylla.scylladb.com/v1 +kind: ScyllaCluster +metadata: + name: scylla + namespace: union +spec: + version: 2025.1.5 + agentVersion: 3.5.1@sha256:d1b57d08b9949c8faad2048fdf4dc7c502dae81da856c3c6b3a77dd347d5c7fc + repository: scylladb/scylla + agentRepository: scylladb/scylla-manager-agent + developerMode: true + sysctls: + - fs.aio-max-nr=30000000 + datacenter: + name: dc1 + racks: + - agentResources: + requests: + cpu: 50m + memory: 10M + members: 3 + name: rack1 + placement: + nodeAffinity: {} + tolerations: [] + resources: + limits: + cpu: 2 + memory: 4Gi + requests: + cpu: 1 + memory: 2Gi + storage: + capacity: 100Gi + storageClassName: scylladb +--- +# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: release-name-envoy-gateway-certgen + namespace: 'union' + labels: + 
helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm + annotations: + "helm.sh/hook": pre-install, pre-upgrade + "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. +--- +# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: 'release-name-envoy-gateway-certgen:union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm + annotations: + "helm.sh/hook": pre-install, pre-upgrade + "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. +rules: + - apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + verbs: + - get + - list + - watch + - apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + resourceNames: + - 'envoy-gateway-topology-injector.union' + verbs: + - update + - patch +--- +# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: 'release-name-envoy-gateway-certgen:union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm + annotations: + "helm.sh/hook": pre-install, pre-upgrade + "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. 
+roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: 'release-name-envoy-gateway-certgen:union' +subjects: + - kind: ServiceAccount + name: 'release-name-envoy-gateway-certgen' + namespace: 'union' +--- +# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: release-name-envoy-gateway-certgen + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm + annotations: + "helm.sh/hook": pre-install, pre-upgrade + "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. +rules: +- apiGroups: + - "" + resources: + - secrets + verbs: + - get + - create + - update +--- +# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-envoy-gateway-certgen + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm + annotations: + "helm.sh/hook": pre-install, pre-upgrade + "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. 
+roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: 'release-name-envoy-gateway-certgen' +subjects: +- kind: ServiceAccount + name: 'release-name-envoy-gateway-certgen' + namespace: 'union' +--- +# Source: controlplane/charts/envoy-gateway/templates/certgen.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: release-name-envoy-gateway-certgen + namespace: 'union' + labels: + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm + annotations: + "helm.sh/hook": pre-install, pre-upgrade +spec: + backoffLimit: 1 + completions: 1 + parallelism: 1 + template: + metadata: + labels: + app: certgen + spec: + containers: + - command: + - envoy-gateway + - certgen + env: + - name: ENVOY_GATEWAY_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + - name: KUBERNETES_CLUSTER_DOMAIN + value: cluster.local + image: docker.io/envoyproxy/gateway:v1.6.4 + imagePullPolicy: IfNotPresent + name: envoy-gateway-certgen + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault + imagePullSecrets: [] + restartPolicy: Never + serviceAccountName: release-name-envoy-gateway-certgen + ttlSecondsAfterFinished: 30 +--- +# Source: controlplane/charts/envoy-gateway/templates/envoy-proxy-topology-injector-webhook.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: 'envoy-gateway-topology-injector.union' + annotations: + "helm.sh/hook": pre-install, pre-upgrade + "helm.sh/hook-weight": "-1" + labels: + app.kubernetes.io/component: topology-injector + helm.sh/chart: envoy-gateway-v1.6.4 + app.kubernetes.io/name: envoy-gateway + app.kubernetes.io/instance: release-name + 
app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/managed-by: Helm +webhooks: + - name: topology.webhook.gateway.envoyproxy.io + admissionReviewVersions: ["v1"] + sideEffects: None + clientConfig: + service: + name: envoy-gateway + namespace: 'union' + path: "/inject-pod-topology" + port: 9443 + failurePolicy: Ignore + rules: + - operations: ["CREATE"] + apiGroups: [""] + apiVersions: ["v1"] + resources: ["pods/binding"] + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: In + values: + - union diff --git a/tests/values/controlplane.custom-oidc.yaml b/tests/values/controlplane.custom-oidc.yaml new file mode 100644 index 00000000..300ba1b9 --- /dev/null +++ b/tests/values/controlplane.custom-oidc.yaml @@ -0,0 +1,56 @@ +# Test fixture: Custom OIDC provider configuration. +# Exercises the OAuth2 globals for non-Okta IdPs (e.g. Entra ID, Keycloak). +# All values use generic, non-internal names. + +global: + INTERNAL_CLIENT_ID: "test-internal-client-id" + AUTH_TOKEN_URL: "https://idp.example.com/oauth2/v2.0/token" + OIDC_BASE_URL: "https://idp.example.com/oauth2/v2.0" + OIDC_CLIENT_ID: "00000000-1111-2222-3333-444444444444" + CLI_CLIENT_ID: "55555555-6666-7777-8888-999999999999" + OIDC_METADATA_URL: ".well-known/openid-configuration" + OIDC_ALLOWED_AUDIENCE: + - "api://my-app" + - "00000000-1111-2222-3333-444444444444" + OIDC_APP_SCOPE: "api://my-app/all" + OIDC_APP_AUDIENCE: "api://my-app" + +dbHost: "db-instance-url" +dbName: "dbName" +dbUser: "dbUser" +dbPass: "dbPass" +bucketName: "bucketName" +artifactsBucketName: "artifactsBucketName" + +configMap: + connection: + environment: staging + region: us-east-2 + rootTenantURLPattern: dns:///fake-host.domain +controlplane: + enabled: true +ingress: + host: fake-host.domain + tls: + - hosts: + - fake-host.domain + secretName: fake-host-tls-secret +flyte: + common: + ingress: + tls: + secretName: fake-host-tls-secret + host: fake-host.domain + configmap: + admin: + admin: + 
endpoint: dns:///fake-host.domain + insecure: false + adminServer: + auth: + appAuth: + # Identity type claim mapping for non-Okta IdPs. + # This is set in values overlay, not via a global. + identityTypeClaimsForApps: + idtyp: + - app From 5465a6e6459ba25547f0468546a82e22f305aec0 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 07:17:31 +1000 Subject: [PATCH 08/23] Wire OIDC_APP_SCOPE and OIDC_APP_AUDIENCE globals into chart templates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flyteClient.scopes uses OIDC_APP_SCOPE (default: "all"). flyteClient.audience uses OIDC_APP_AUDIENCE (default: ""). Test fixture updated with selfhosted-intracluster overlay to verify. openId.scopes and allowedAudience remain in base values — terraform handles appending app_scope and setting IdP-specific audiences via the direct merge path (lists can't be conditionally built in static YAML). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 4 +- .../values.gcp.selfhosted-intracluster.yaml | 4 +- tests/generated/controlplane.custom-oidc.yaml | 1297 ++++++++--------- tests/values/controlplane.custom-oidc.yaml | 1 + 4 files changed, 598 insertions(+), 708 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index 754702af..cbae4cbb 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -341,7 +341,9 @@ flyte: flyteClient: clientId: '{{ .Values.global.CLI_CLIENT_ID }}' redirectUri: "http://localhost:53593/callback" - scopes: ["all"] + scopes: + - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' + audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' userAuth: openId: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml 
b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index 594586cb..70a9b67a 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -371,7 +371,9 @@ flyte: flyteClient: clientId: '{{ .Values.global.CLI_CLIENT_ID }}' redirectUri: "http://localhost:53593/callback" - scopes: ["all"] + scopes: + - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' + audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' userAuth: openId: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' diff --git a/tests/generated/controlplane.custom-oidc.yaml b/tests/generated/controlplane.custom-oidc.yaml index 07c19311..bb61fd1f 100644 --- a/tests/generated/controlplane.custom-oidc.yaml +++ b/tests/generated/controlplane.custom-oidc.yaml @@ -170,19 +170,6 @@ spec: app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name --- -# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: envoy-gateway - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm ---- # Source: controlplane/charts/flyte/templates/admin/rbac.yaml apiVersion: v1 kind: ServiceAccount @@ -194,6 +181,26 @@ metadata: app.kubernetes.io/instance: release-name helm.sh/chart: flyte-v1.16.1 #app.kubernetes.io/managed-by: Helm + annotations: + eks.amazonaws.com/role-arn: '' +imagePullSecrets: + - name: union-registry-secret +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + 
app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx + namespace: union +automountServiceAccountToken: true --- # Source: controlplane/charts/scylla-operator/templates/operator.serviceaccount.yaml apiVersion: v1 @@ -226,6 +233,10 @@ metadata: app.kubernetes.io/instance: release-name helm.sh/chart: controlplane-2026.4.5 app.kubernetes.io/managed-by: Helm + annotations: + eks.amazonaws.com/role-arn: '' +imagePullSecrets: + - name: union-registry-secret --- # Source: controlplane/templates/console/serviceaccount.yaml apiVersion: v1 @@ -354,47 +365,7 @@ metadata: namespace: union type: Opaque stringData: - client_secret: foobar ---- -# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-config.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm -data: - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - extensionApis: {} - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - logging: - level: - default: info - provider: - kubernetes: - rateLimitDeployment: - container: - image: docker.io/envoyproxy/ratelimit:3fb70258 - patch: - type: StrategicMerge - value: - spec: - template: - spec: - containers: - - imagePullPolicy: IfNotPresent - name: envoy-ratelimit - shutdownManager: - image: docker.io/envoyproxy/gateway:v1.6.4 - type: Kubernetes + client_secret: placeholder --- # Source: controlplane/charts/flyte/templates/admin/configmap.yaml apiVersion: v1 @@ -458,31 +429,43 @@ data: type: noop server.yaml: | admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/client_secret endpoint: dns:/// - insecure: false + insecure: true auth: appAuth: + authServerType: External + 
externalAuthServer: + baseUrl: 'https://idp.example.com/oauth2/v2.0' + metadataUrl: '.well-known/openid-configuration' identityTypeClaimsForApps: idtyp: - app thirdPartyConfig: flyteClient: - clientId: flytectl + audience: 'api://my-app' + clientId: '55555555-6666-7777-8888-999999999999' redirectUri: http://localhost:53593/callback scopes: - - offline - - all + - 'api://my-app/all' authorizedUris: - - https://localhost:30081 - http://flyteadmin:80 - - http://flyteadmin.flyte.svc.cluster.local:80 + - http://flyteadmin.union.svc.cluster.local:80 + grpcAuthorizationHeader: flyte-authorization + httpAuthorizationHeader: flyte-authorization userAuth: + cookieSetting: + domain: "" + sameSitePolicy: LaxMode + idpQueryParameter: idp openId: - baseUrl: https://accounts.google.com - clientId: 657465813211-6eog7ek7li5k7i7fvgv2921075063hpe.apps.googleusercontent.com + baseUrl: 'https://idp.example.com/oauth2/v2.0' + clientId: '00000000-1111-2222-3333-444444444444' scopes: - profile - openid + - offline_access authorizer: authorizerClient: forwardHeaders: @@ -536,6 +519,8 @@ data: connectPort: 8089 httpPort: 8088 port: 8089 + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -587,6 +572,30 @@ data: BASE_URL: /console CONFIG_DIR: /etc/flyte/config --- +# Source: controlplane/charts/ingress-nginx/templates/controller-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller + namespace: union +data: + allow-snippet-annotations: "true" + annotations-risk-level: "Critical" + grpc-connect-timeout: "1200" + grpc-read-timeout: "604800" + grpc-send-timeout: "604800" + proxy-connect-timeout: "60" + proxy-read-timeout: "3600" + 
proxy-send-timeout: "3600" +--- # Source: controlplane/templates/cacheservice/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -674,6 +683,11 @@ metadata: app.kubernetes.io/managed-by: Helm data: config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true authorizer: authorizerClient: forwardHeaders: @@ -726,6 +740,8 @@ data: connectPort: 8081 metrics: scope: 'authorizer:' + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -739,6 +755,9 @@ data: - all tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true internalConnectionConfig: enabled: true urlPattern: _SERVICE_.union.svc.cluster.local:80 @@ -757,6 +776,11 @@ metadata: app.kubernetes.io/managed-by: Helm data: config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true authorizer: authorizerClient: forwardHeaders: @@ -801,6 +825,8 @@ data: connectPort: 8081 metrics: scope: 'cluster:' + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -814,6 +840,9 @@ data: - all tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true internalConnectionConfig: enabled: true urlPattern: _SERVICE_.union.svc.cluster.local:80 @@ -832,6 +861,11 @@ metadata: app.kubernetes.io/managed-by: Helm data: config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true authorizer: authorizerClient: forwardHeaders: @@ -853,7 +887,7 @@ data: dataproxy: clusterSelector: type: local - secureTunnelTenantURLPattern: http://ingress-nginx-internal.ingress-nginx.svc.cluster.local:80 + secureTunnelTenantURLPattern: '' logger: formatter: type: json @@ -864,6 +898,8 @@ data: sharedService: 
metrics: scope: 'dataproxy:' + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -877,6 +913,9 @@ data: - all tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true internalConnectionConfig: enabled: true urlPattern: _SERVICE_.union.svc.cluster.local:80 @@ -895,6 +934,11 @@ metadata: app.kubernetes.io/managed-by: Helm data: config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true authorizer: authorizerClient: forwardHeaders: @@ -928,6 +972,17 @@ data: eventsProxy: recorderType: RunService executions: + app: + adminClient: + connection: + authorizationHeader: flyte-authorization + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true + scopes: + - all + tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' apps: enrichIdentities: false publicURLPattern: https://%s.apps. 
@@ -946,6 +1001,8 @@ data: sharedService: metrics: scope: 'executions:' + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -959,6 +1016,9 @@ data: - all tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true internalConnectionConfig: enabled: true urlPattern: _SERVICE_.union.svc.cluster.local:80 @@ -979,6 +1039,11 @@ metadata: app.kubernetes.io/managed-by: Helm data: config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true authorizer: authorizerClient: forwardHeaders: @@ -1017,6 +1082,8 @@ data: sharedService: metrics: scope: 'queue:' + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -1030,6 +1097,9 @@ data: - all tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true internalConnectionConfig: enabled: true urlPattern: _SERVICE_.union.svc.cluster.local:80 @@ -1048,6 +1118,11 @@ metadata: app.kubernetes.io/managed-by: Helm data: config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true authorizer: authorizerClient: forwardHeaders: @@ -1086,6 +1161,8 @@ data: sharedService: metrics: scope: 'run-scheduler:' + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -1099,6 +1176,9 @@ data: - all tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true internalConnectionConfig: enabled: true urlPattern: _SERVICE_.union.svc.cluster.local:80 @@ -1117,6 +1197,11 @@ metadata: app.kubernetes.io/managed-by: Helm data: config.yaml: | + admin: + clientId: 'test-internal-client-id' + clientSecretLocation: /etc/secrets/union/client_secret + endpoint: '' + insecure: true authorizer: authorizerClient: 
forwardHeaders: @@ -1150,6 +1235,8 @@ data: connectPort: 8081 metrics: scope: 'usage:' + security: + singleTenantOrgID: '' selfServeConfig: legacyHosts: - '' @@ -1163,6 +1250,9 @@ data: - all tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' type: ClientSecret + connection: + insecure: false + insecureSkipVerify: true internalConnectionConfig: enabled: true urlPattern: _SERVICE_.union.svc.cluster.local:80 @@ -4880,138 +4970,6 @@ parameters: reclaimPolicy: Delete allowVolumeExpansion: true --- -# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - creationTimestamp: null - name: release-name-envoy-gateway-envoy-gateway-role -rules: -- apiGroups: - - "" - resources: - - nodes - - namespaces - verbs: - - get - - list - - watch -- apiGroups: - - gateway.networking.k8s.io - resources: - - gatewayclasses - verbs: - - get - - list - - patch - - update - - watch -- apiGroups: - - gateway.networking.k8s.io - resources: - - gatewayclasses/status - verbs: - - update -- apiGroups: - - multicluster.x-k8s.io - resources: - - serviceimports - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - configmaps - - secrets - - services - verbs: - - get - - list - - watch -- apiGroups: - - apps - resources: - - deployments - - daemonsets - verbs: - - get - - list - - watch -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - list - - watch -- apiGroups: - - gateway.envoyproxy.io - resources: - - envoyproxies - - envoypatchpolicies - - clienttrafficpolicies - - backendtrafficpolicies - - securitypolicies - - envoyextensionpolicies - - backends - - httproutefilters - verbs: - - get - - list - - watch -- apiGroups: - - gateway.envoyproxy.io - resources: - - envoypatchpolicies/status - - clienttrafficpolicies/status - - backendtrafficpolicies/status - - securitypolicies/status - - envoyextensionpolicies/status - - backends/status - 
verbs: - - update -- apiGroups: - - gateway.networking.k8s.io - resources: - - gateways - - grpcroutes - - httproutes - - referencegrants - - tcproutes - - tlsroutes - - udproutes - - backendtlspolicies - verbs: - - get - - list - - watch -- apiGroups: - - gateway.networking.k8s.io - resources: - - gateways/status - - grpcroutes/status - - httproutes/status - - tcproutes/status - - tlsroutes/status - - udproutes/status - - backendtlspolicies/status - verbs: - - update -- apiGroups: - - "" - resources: - - pods - - pods/binding - verbs: - - get - - list - - patch - - update - - watch ---- # Source: controlplane/charts/flyte/templates/admin/rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -5043,6 +5001,90 @@ rules: verbs: - '*' --- +# Source: controlplane/charts/ingress-nginx/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + name: controlplane-nginx +rules: + - apiGroups: + - "" + resources: + - configmaps + - endpoints + - nodes + - pods + - secrets + - namespaces + verbs: + - list + - watch + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - apiGroups: + - networking.k8s.io + resources: + - ingresses/status + verbs: + - update + - apiGroups: + - networking.k8s.io + resources: + - ingressclasses + verbs: + - get + - list + - watch + - apiGroups: + - discovery.k8s.io + resources: + - 
endpointslices + verbs: + - list + - watch + - get +--- # Source: controlplane/charts/scylla-operator/templates/edit_clusterrole.yaml kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 @@ -5677,20 +5719,6 @@ rules: - list - watch --- -# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: release-name-envoy-gateway-envoy-gateway-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-envoy-gateway-envoy-gateway-role -subjects: -- kind: ServiceAccount - name: 'envoy-gateway' - namespace: 'union' ---- # Source: controlplane/charts/flyte/templates/admin/rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -5710,6 +5738,27 @@ subjects: name: flyteadmin namespace: union --- +# Source: controlplane/charts/ingress-nginx/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + name: controlplane-nginx +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: controlplane-nginx +subjects: + - kind: ServiceAccount + name: controlplane-nginx + namespace: union +--- # Source: controlplane/charts/scylla-operator/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -5724,109 +5773,99 @@ subjects: name: scylla-operator namespace: scylla-operator --- -# Source: controlplane/charts/envoy-gateway/templates/infra-manager-rbac.yaml +# Source: controlplane/charts/ingress-nginx/templates/controller-role.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: release-name-envoy-gateway-infra-manager - namespace: 
'union' labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx + namespace: union rules: -- apiGroups: - - "" - resources: - - serviceaccounts - - services - - configmaps - verbs: - - create - - get - - list - - delete - - deletecollection - - patch -- apiGroups: - - apps - resources: - - deployments - - daemonsets - verbs: - - create - - get - - delete - - deletecollection - - patch -- apiGroups: - - autoscaling - - policy - resources: - - horizontalpodautoscalers - - poddisruptionbudgets - verbs: - - create - - get - - list - - delete - - deletecollection - - patch -- apiGroups: - - certificates.k8s.io - resources: - - clustertrustbundles - verbs: - - list - - get - - watch ---- -# Source: controlplane/charts/envoy-gateway/templates/leader-election-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: release-name-envoy-gateway-leader-election-role - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - apiGroups: + - "" + resources: + - configmaps + - pods + - secrets + - endpoints + 
verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + # Omit Ingress status permissions if `--update-status` is disabled. + - apiGroups: + - networking.k8s.io + resources: + - ingresses/status + verbs: + - update + - apiGroups: + - networking.k8s.io + resources: + - ingressclasses + verbs: + - get + - list + - watch + - apiGroups: + - coordination.k8s.io + resources: + - leases + resourceNames: + - controlplane-nginx-leader + verbs: + - get + - update + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - list + - watch + - get --- # Source: controlplane/templates/flyte-core-app.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -5859,47 +5898,28 @@ rules: verbs: - '*' --- -# Source: controlplane/charts/envoy-gateway/templates/infra-manager-rbac.yaml +# Source: controlplane/charts/ingress-nginx/templates/controller-rolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: release-name-envoy-gateway-infra-manager - namespace: 'union' labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: 'release-name-envoy-gateway-infra-manager' -subjects: -- kind: ServiceAccount - name: 'envoy-gateway' - namespace: 'union' ---- -# Source: controlplane/charts/envoy-gateway/templates/leader-election-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: 
release-name-envoy-gateway-leader-election-rolebinding - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: 'release-name-envoy-gateway-leader-election-role' + name: controlplane-nginx subjects: -- kind: ServiceAccount - name: 'envoy-gateway' - namespace: 'union' + - kind: ServiceAccount + name: controlplane-nginx + namespace: union --- # Source: controlplane/templates/flyte-core-app.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -5921,42 +5941,6 @@ subjects: name: flyteadmin namespace: union --- -# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-service.yaml -apiVersion: v1 -kind: Service -metadata: - name: envoy-gateway - namespace: 'union' - labels: - control-plane: envoy-gateway - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - selector: - control-plane: envoy-gateway - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - ports: - - name: grpc - port: 18000 - targetPort: 18000 - - name: ratelimit - port: 18001 - targetPort: 18001 - - name: wasm - port: 18002 - targetPort: 18002 - - name: metrics - port: 19001 - targetPort: 19001 - - name: webhook - port: 9443 - targetPort: 9443 ---- # Source: controlplane/charts/flyte/templates/admin/service.yaml apiVersion: v1 kind: Service @@ -6016,6 +6000,68 @@ spec: app.kubernetes.io/name: flyteconsole app.kubernetes.io/instance: release-name --- +# Source: 
controlplane/charts/ingress-nginx/templates/controller-service-metrics.yaml +apiVersion: v1 +kind: Service +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller-metrics + namespace: union +spec: + type: ClusterIP + ports: + - name: metrics + port: 10254 + protocol: TCP + targetPort: metrics + selector: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: controller +--- +# Source: controlplane/charts/ingress-nginx/templates/controller-service.yaml +apiVersion: v1 +kind: Service +metadata: + annotations: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller + namespace: union +spec: + type: ClusterIP + ipFamilyPolicy: SingleStack + ipFamilies: + - IPv4 + ports: + - name: http + port: 80 + protocol: TCP + targetPort: http + appProtocol: http + - name: https + port: 443 + protocol: TCP + targetPort: https + appProtocol: https + selector: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: controller +--- # Source: controlplane/charts/scylla-operator/templates/webhookserver.service.yaml apiVersion: v1 kind: Service @@ -6349,110 +6395,6 @@ spec: app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name --- -# Source: controlplane/charts/envoy-gateway/templates/envoy-gateway-deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: envoy-gateway - namespace: 'union' - 
labels: - control-plane: envoy-gateway - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - control-plane: envoy-gateway - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - prometheus.io/port: "19001" - prometheus.io/scrape: "true" - labels: - control-plane: envoy-gateway - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - spec: - containers: - - args: - - server - - --config-path=/config/envoy-gateway.yaml - env: - - name: ENVOY_GATEWAY_NAMESPACE - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - - name: KUBERNETES_CLUSTER_DOMAIN - value: cluster.local - image: docker.io/envoyproxy/gateway:v1.6.4 - imagePullPolicy: IfNotPresent - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - name: envoy-gateway - ports: - - containerPort: 18000 - name: grpc - - containerPort: 18001 - name: ratelimit - - containerPort: 18002 - name: wasm - - containerPort: 19001 - name: metrics - - name: webhook - containerPort: 9443 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - memory: 1024Mi - requests: - cpu: 100m - memory: 256Mi - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - privileged: false - runAsGroup: 65532 - runAsNonRoot: true - runAsUser: 65532 - seccompProfile: - type: RuntimeDefault - volumeMounts: - - mountPath: /config - name: envoy-gateway-config - readOnly: true - - mountPath: /certs - name: certs - readOnly: true - imagePullSecrets: [] - serviceAccountName: envoy-gateway - terminationGracePeriodSeconds: 10 - volumes: - - configMap: - defaultMode: 420 - name: envoy-gateway-config - name: 
envoy-gateway-config - - name: certs - secret: - secretName: envoy-gateway ---- # Source: controlplane/charts/flyte/templates/admin/deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -6477,7 +6419,7 @@ spec: template: metadata: annotations: - configChecksum: "676ab3d9a1d5a3d13b441cc308a756dfab35fc748311e7b336490809728d692" + configChecksum: "b03b2e79a2b21bbfee529edc64a35bd90243ef79845c3299905bec8bfa6dce4" kubectl.kubernetes.io/default-container: flyteadmin labels: app.kubernetes.io/name: flyteadmin @@ -6499,7 +6441,7 @@ spec: - /etc/flyte/config/*.yaml - migrate - run - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + image: "registry.unionai.cloud/controlplane/services:" imagePullPolicy: "IfNotPresent" name: run-migrations securityContext: @@ -6519,7 +6461,7 @@ spec: - seed-projects - union-health-monitoring - flytesnacks - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + image: "registry.unionai.cloud/controlplane/services:" imagePullPolicy: "IfNotPresent" name: seed-projects securityContext: @@ -6532,7 +6474,7 @@ spec: - mountPath: /etc/flyte/config name: base-config-volume - name: generate-secrets - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + image: "registry.unionai.cloud/controlplane/services:" imagePullPolicy: "IfNotPresent" command: ["/bin/sh", "-c"] args: @@ -6559,7 +6501,7 @@ spec: - --config - /etc/flyte/config/*.yaml - serve - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + image: "registry.unionai.cloud/controlplane/services:" imagePullPolicy: "IfNotPresent" name: flyteadmin ports: @@ -6666,6 +6608,8 @@ spec: helm.sh/chart: flyte-v1.16.1 app.kubernetes.io/managed-by: Helm spec: + imagePullSecrets: + - name: union-registry-secret securityContext: fsGroupChangePolicy: OnRootMismatch runAsNonRoot: true @@ -6673,7 +6617,7 @@ spec: seLinuxOptions: type: spc_t containers: - - image: 
"643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/flyteconsole:" + - image: "registry.unionai.cloud/controlplane/flyteconsole:" imagePullPolicy: "IfNotPresent" name: flyteconsole envFrom: @@ -6706,6 +6650,122 @@ spec: - emptyDir: {} name: shared-data --- +# Source: controlplane/charts/ingress-nginx/templates/controller-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane-nginx-controller + namespace: union +spec: + selector: + matchLabels: + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: controller + replicas: 1 + revisionHistoryLimit: 10 + minReadySeconds: 0 + template: + metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + spec: + dnsPolicy: ClusterFirst + containers: + - name: controller + image: registry.k8s.io/ingress-nginx/controller:v1.12.3@sha256:ac444cd9515af325ba577b596fe4f27a34be1aa330538e8b317ad9d6c8fb94ee + imagePullPolicy: IfNotPresent + lifecycle: + preStop: + exec: + command: + - /wait-shutdown + args: + - /nginx-ingress-controller + - --publish-service=$(POD_NAMESPACE)/controlplane-nginx-controller + - --election-id=controlplane-nginx-leader + - --controller-class=union.ai/controlplane + - --ingress-class=nginx + - --configmap=$(POD_NAMESPACE)/controlplane-nginx-controller + - --enable-metrics=true + - --default-ssl-certificate=/ + securityContext: + runAsNonRoot: true + runAsUser: 101 + runAsGroup: 82 + 
allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault + capabilities: + drop: + - ALL + add: + - NET_BIND_SERVICE + readOnlyRootFilesystem: false + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: LD_PRELOAD + value: /usr/local/lib/libmimalloc.so + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + ports: + - name: http + containerPort: 80 + protocol: TCP + - name: https + containerPort: 443 + protocol: TCP + - name: metrics + containerPort: 10254 + protocol: TCP + resources: + requests: + cpu: 100m + memory: 90Mi + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: controlplane-nginx + terminationGracePeriodSeconds: 300 +--- # Source: controlplane/charts/scylla-operator/templates/operator.deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -6882,7 +6942,7 @@ spec: - /etc/cacheservice/config/*.yaml - migrate - run - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + image: "registry.unionai.cloud/controlplane/services:" imagePullPolicy: "IfNotPresent" name: run-migrations volumeMounts: @@ -6900,7 +6960,7 @@ spec: - --config - /etc/cacheservice/config/*.yaml - serve - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:" + image: "registry.unionai.cloud/controlplane/services:" imagePullPolicy: "IfNotPresent" name: cacheservice ports: @@ -6984,6 +7044,8 @@ spec: app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: unionconsole securityContext: 
fsGroupChangePolicy: OnRootMismatch @@ -6998,7 +7060,7 @@ spec: capabilities: drop: - ALL - image: "643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/unionconsole:2026.4.5" + image: "registry.unionai.cloud/controlplane/unionconsole:2026.4.5" imagePullPolicy: IfNotPresent ports: - name: http @@ -7007,6 +7069,9 @@ spec: - name: http-metrics containerPort: 8081 protocol: TCP + env: + - name: UNION_ORG_OVERRIDE + value: '' resources: limits: cpu: 500m @@ -7048,6 +7113,8 @@ spec: app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: authorizer volumes: - name: secrets @@ -7161,6 +7228,8 @@ spec: app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: cluster volumes: - name: secrets @@ -7290,6 +7359,8 @@ spec: app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: dataproxy volumes: - name: secrets @@ -7400,6 +7471,8 @@ spec: app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: executions volumes: - name: secrets @@ -7527,6 +7600,8 @@ spec: app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: queue volumes: - name: secrets @@ -7653,6 +7728,8 @@ spec: app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: run-scheduler volumes: - name: secrets @@ -7780,6 +7857,8 @@ spec: app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name spec: + imagePullSecrets: + - name: union-registry-secret serviceAccountName: usage volumes: - name: secrets @@ -8084,6 +8163,22 @@ spec: type: 
Utilization averageUtilization: 80 --- +# Source: controlplane/charts/ingress-nginx/templates/controller-ingressclass.yaml +apiVersion: networking.k8s.io/v1 +kind: IngressClass +metadata: + labels: + helm.sh/chart: ingress-nginx-4.12.3 + app.kubernetes.io/name: ingress-nginx + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.12.3" + app.kubernetes.io/part-of: ingress-nginx + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: controller + name: controlplane +spec: + controller: union.ai/controlplane +--- # Source: controlplane/templates/flyte-core-app.yaml apiVersion: networking.k8s.io/v1 kind: Ingress @@ -8115,7 +8210,7 @@ metadata: large_client_header_buffers 64 32k; nginx.ingress.kubernetes.io/service-upstream: "true" nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie - nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri nginx.ingress.kubernetes.io/auth-url: https://$host/me nginx.org/websocket-services: dataproxy-service @@ -8176,7 +8271,7 @@ metadata: nginx.ingress.kubernetes.io/service-upstream: "true" nginx.ingress.kubernetes.io/backend-protocol: GRPC nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie - nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me nginx.ingress.kubernetes.io/use-regex: "true" spec: @@ 
-8228,7 +8323,7 @@ metadata: large_client_header_buffers 64 32k; nginx.ingress.kubernetes.io/service-upstream: "true" nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie - nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri nginx.ingress.kubernetes.io/auth-url: https://$host/me nginx.org/websocket-services: dataproxy-service @@ -8282,7 +8377,7 @@ metadata: large_client_header_buffers 64 32k; nginx.ingress.kubernetes.io/service-upstream: "true" nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie - nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri nginx.ingress.kubernetes.io/auth-url: https://$host/me nginx.org/websocket-services: dataproxy-service @@ -8700,7 +8795,7 @@ metadata: nginx.ingress.kubernetes.io/service-upstream: "true" nginx.ingress.kubernetes.io/backend-protocol: GRPC nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie - nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me spec: 
ingressClassName: "controlplane" @@ -9439,7 +9534,7 @@ metadata: nginx.ingress.kubernetes.io/service-upstream: "true" nginx.ingress.kubernetes.io/backend-protocol: GRPC nginx.ingress.kubernetes.io/auth-cache-key: $http_authorization$http_flyte_authorization$http_cookie - nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username nginx.ingress.kubernetes.io/auth-url: http://flyteadmin.union.svc.cluster.local/me spec: ingressClassName: "controlplane" @@ -10134,7 +10229,7 @@ metadata: large_client_header_buffers 64 32k; nginx.ingress.kubernetes.io/service-upstream: "true" nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie - nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token + nginx.ingress.kubernetes.io/auth-response-headers: Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username nginx.ingress.kubernetes.io/auth-signin: https://$host/login?redirect_url=$escaped_request_uri nginx.ingress.kubernetes.io/auth-url: https://$host/me nginx.org/websocket-services: dataproxy-service @@ -10284,6 +10379,10 @@ webhooks: - scylladbmanagerclusterregistrations - scylladbmanagertasks --- +# Source: controlplane/charts/ingress-nginx/templates/controller-poddisruptionbudget.yaml +# PDB is not supported for DaemonSets. 
+# https://github.com/kubernetes/kubernetes/issues/108124 +--- # Source: controlplane/templates/secret.yaml --- --- @@ -10355,217 +10454,3 @@ spec: storage: capacity: 100Gi storageClassName: scylladb ---- -# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: release-name-envoy-gateway-certgen - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": pre-install, pre-upgrade - "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. ---- -# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: 'release-name-envoy-gateway-certgen:union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": pre-install, pre-upgrade - "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. 
-rules: - - apiGroups: - - admissionregistration.k8s.io - resources: - - mutatingwebhookconfigurations - verbs: - - get - - list - - watch - - apiGroups: - - admissionregistration.k8s.io - resources: - - mutatingwebhookconfigurations - resourceNames: - - 'envoy-gateway-topology-injector.union' - verbs: - - update - - patch ---- -# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: 'release-name-envoy-gateway-certgen:union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": pre-install, pre-upgrade - "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: 'release-name-envoy-gateway-certgen:union' -subjects: - - kind: ServiceAccount - name: 'release-name-envoy-gateway-certgen' - namespace: 'union' ---- -# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: release-name-envoy-gateway-certgen - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": pre-install, pre-upgrade - "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. 
-rules: -- apiGroups: - - "" - resources: - - secrets - verbs: - - get - - create - - update ---- -# Source: controlplane/charts/envoy-gateway/templates/certgen-rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: release-name-envoy-gateway-certgen - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": pre-install, pre-upgrade - "helm.sh/hook-weight": "-1" # Ensure rbac is created before the certgen job when using ArgoCD. -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: 'release-name-envoy-gateway-certgen' -subjects: -- kind: ServiceAccount - name: 'release-name-envoy-gateway-certgen' - namespace: 'union' ---- -# Source: controlplane/charts/envoy-gateway/templates/certgen.yaml -apiVersion: batch/v1 -kind: Job -metadata: - name: release-name-envoy-gateway-certgen - namespace: 'union' - labels: - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": pre-install, pre-upgrade -spec: - backoffLimit: 1 - completions: 1 - parallelism: 1 - template: - metadata: - labels: - app: certgen - spec: - containers: - - command: - - envoy-gateway - - certgen - env: - - name: ENVOY_GATEWAY_NAMESPACE - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - - name: KUBERNETES_CLUSTER_DOMAIN - value: cluster.local - image: docker.io/envoyproxy/gateway:v1.6.4 - imagePullPolicy: IfNotPresent - name: envoy-gateway-certgen - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - privileged: false - readOnlyRootFilesystem: true - runAsGroup: 65532 - runAsNonRoot: true - runAsUser: 65532 - seccompProfile: - type: 
RuntimeDefault - imagePullSecrets: [] - restartPolicy: Never - serviceAccountName: release-name-envoy-gateway-certgen - ttlSecondsAfterFinished: 30 ---- -# Source: controlplane/charts/envoy-gateway/templates/envoy-proxy-topology-injector-webhook.yaml -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - name: 'envoy-gateway-topology-injector.union' - annotations: - "helm.sh/hook": pre-install, pre-upgrade - "helm.sh/hook-weight": "-1" - labels: - app.kubernetes.io/component: topology-injector - helm.sh/chart: envoy-gateway-v1.6.4 - app.kubernetes.io/name: envoy-gateway - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "v1.6.4" - app.kubernetes.io/managed-by: Helm -webhooks: - - name: topology.webhook.gateway.envoyproxy.io - admissionReviewVersions: ["v1"] - sideEffects: None - clientConfig: - service: - name: envoy-gateway - namespace: 'union' - path: "/inject-pod-topology" - port: 9443 - failurePolicy: Ignore - rules: - - operations: ["CREATE"] - apiGroups: [""] - apiVersions: ["v1"] - resources: ["pods/binding"] - namespaceSelector: - matchExpressions: - - key: kubernetes.io/metadata.name - operator: In - values: - - union diff --git a/tests/values/controlplane.custom-oidc.yaml b/tests/values/controlplane.custom-oidc.yaml index 300ba1b9..34d42a76 100644 --- a/tests/values/controlplane.custom-oidc.yaml +++ b/tests/values/controlplane.custom-oidc.yaml @@ -1,3 +1,4 @@ +# helm-values: values.aws.selfhosted-intracluster.yaml # Test fixture: Custom OIDC provider configuration. # Exercises the OAuth2 globals for non-Okta IdPs (e.g. Entra ID, Keycloak). # All values use generic, non-internal names. 
From 6b4279892f40a238f37ea2148cf3c041cdf9cbbc Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 07:27:32 +1000 Subject: [PATCH 09/23] Document OIDC auth config with OAuth app numbers and flow references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each config section now documents: - Which OAuth app (1-5) it maps to - Which authentication flow it's used in (browser, CLI/SDK, service-to-service) - How it relates to the globals in Section 1 Cross-references the five OAuth apps from the authentication architecture: App 1: Browser (confidential) — userAuth.openId App 2: CLI (public, PKCE) — thirdPartyConfig.flyteClient App 3: Internal S2S — INTERNAL_CLIENT_ID global Apps 4,5: Operator, EAGER — dataplane values Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 54 ++++++++++++++++++- .../values.gcp.selfhosted-intracluster.yaml | 54 ++++++++++++++++++- 2 files changed, 104 insertions(+), 4 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index cbae4cbb..09160558 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -321,33 +321,83 @@ flyte: singleTenantOrgID: '{{ .Values.global.UNION_ORG }}' # --- OIDC Authentication --- - # To enable authentication, set server.security.useAuth: true - # and configure the auth globals in Section 1 above. + # Flyteadmin acts as both the OAuth2 resource server (validates access tokens) + # and the OIDC relying party (browser login flow). Configure the globals in + # Section 1 above, then enable auth: + # # server: # security: # useAuth: true + # + # The sections below map to different authentication flows: + # + # auth.appAuth.externalAuthServer + # Resource server config — validates access tokens from ALL flows. 
+ # Uses OIDC_BASE_URL for JWKS discovery and token validation. + # Uses: OAuth Apps 1-5 (all tokens are validated here). + # Flows: Browser login, CLI/SDK PKCE, service-to-service. + # + # auth.appAuth.thirdPartyConfig.flyteClient + # CLI/SDK PKCE client config — returned by GetPublicClientConfig RPC. + # The SDK/CLI uses this to initiate the PKCE authorization flow. + # Uses: OAuth App 2 (CLI — public client). + # Flow: CLI/SDK PKCE (Flow 2). + # + # auth.userAuth.openId + # Browser login config — OIDC redirect flow for web console. + # Uses: OAuth App 1 (Browser — confidential client). + # Flow: Browser login (Flow 1). + # + # Service-to-service auth (OAuth Apps 3-5) is configured separately: + # - INTERNAL_CLIENT_ID + AUTH_TOKEN_URL in globals (App 3) + # - Operator and EAGER credentials in dataplane values (Apps 4, 5) auth: + # Custom authorization header name. All services use this instead of + # the standard "authorization" header to avoid conflicts with service + # meshes (e.g. Linkerd, Istio) that intercept the default header. httpAuthorizationHeader: "flyte-authorization" grpcAuthorizationHeader: "flyte-authorization" + # URIs that flyteadmin accepts as valid audiences in its own tokens. authorizedUris: - "http://flyteadmin:80" - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' + + # --- Resource Server (validates access tokens from all flows) --- appAuth: authServerType: "External" externalAuthServer: + # OIDC issuer for JWKS discovery and token validation. baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' + # Metadata discovery endpoint (see OIDC_METADATA_URL global). metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' + # allowedAudience is set by Terraform (list type, can't use a single global). + # Default: ["https://{domain}"]. Override via OIDC_ALLOWED_AUDIENCE global + # or Terraform authn module output. + + # --- CLI/SDK PKCE Client (Flow 2: SDK/CLI authentication) --- + # Returned by the GetPublicClientConfig RPC. 
The SDK reads this to + # know which client ID, scopes, and audience to use for PKCE auth. thirdPartyConfig: flyteClient: + # OAuth App 2: CLI (public client, PKCE flow) clientId: '{{ .Values.global.CLI_CLIENT_ID }}' redirectUri: "http://localhost:53593/callback" + # Resource scope — determines the audience of the access token. + # See OIDC_APP_SCOPE global. scopes: - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' + # Audience parameter for the authorization request. + # See OIDC_APP_AUDIENCE global. audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' + + # --- Browser Login (Flow 1: OIDC redirect for web console) --- userAuth: openId: + # OAuth App 1: Browser (confidential client, authorization_code grant) baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' + # Scopes requested during browser OIDC login. + # Terraform appends OIDC_APP_SCOPE here when set (for Entra ID). scopes: ["profile", "openid", "offline_access"] cookieSetting: sameSitePolicy: "LaxMode" diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index 70a9b67a..a80df4a4 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -351,33 +351,83 @@ flyte: singleTenantOrgID: '{{ .Values.global.UNION_ORG }}' # --- OIDC Authentication --- - # To enable authentication, set server.security.useAuth: true - # and configure the auth globals in Section 1 above. + # Flyteadmin acts as both the OAuth2 resource server (validates access tokens) + # and the OIDC relying party (browser login flow). Configure the globals in + # Section 1 above, then enable auth: + # # server: # security: # useAuth: true + # + # The sections below map to different authentication flows: + # + # auth.appAuth.externalAuthServer + # Resource server config — validates access tokens from ALL flows. 
+ # Uses OIDC_BASE_URL for JWKS discovery and token validation. + # Uses: OAuth Apps 1-5 (all tokens are validated here). + # Flows: Browser login, CLI/SDK PKCE, service-to-service. + # + # auth.appAuth.thirdPartyConfig.flyteClient + # CLI/SDK PKCE client config — returned by GetPublicClientConfig RPC. + # The SDK/CLI uses this to initiate the PKCE authorization flow. + # Uses: OAuth App 2 (CLI — public client). + # Flow: CLI/SDK PKCE (Flow 2). + # + # auth.userAuth.openId + # Browser login config — OIDC redirect flow for web console. + # Uses: OAuth App 1 (Browser — confidential client). + # Flow: Browser login (Flow 1). + # + # Service-to-service auth (OAuth Apps 3-5) is configured separately: + # - INTERNAL_CLIENT_ID + AUTH_TOKEN_URL in globals (App 3) + # - Operator and EAGER credentials in dataplane values (Apps 4, 5) auth: + # Custom authorization header name. All services use this instead of + # the standard "authorization" header to avoid conflicts with service + # meshes (e.g. Linkerd, Istio) that intercept the default header. httpAuthorizationHeader: "flyte-authorization" grpcAuthorizationHeader: "flyte-authorization" + # URIs that flyteadmin accepts as valid audiences in its own tokens. authorizedUris: - "http://flyteadmin:80" - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' + + # --- Resource Server (validates access tokens from all flows) --- appAuth: authServerType: "External" externalAuthServer: + # OIDC issuer for JWKS discovery and token validation. baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' + # Metadata discovery endpoint (see OIDC_METADATA_URL global). metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' + # allowedAudience is set by Terraform (list type, can't use a single global). + # Default: ["https://{domain}"]. Override via OIDC_ALLOWED_AUDIENCE global + # or Terraform authn module output. + + # --- CLI/SDK PKCE Client (Flow 2: SDK/CLI authentication) --- + # Returned by the GetPublicClientConfig RPC. 
The SDK reads this to + # know which client ID, scopes, and audience to use for PKCE auth. thirdPartyConfig: flyteClient: + # OAuth App 2: CLI (public client, PKCE flow) clientId: '{{ .Values.global.CLI_CLIENT_ID }}' redirectUri: "http://localhost:53593/callback" + # Resource scope — determines the audience of the access token. + # See OIDC_APP_SCOPE global. scopes: - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' + # Audience parameter for the authorization request. + # See OIDC_APP_AUDIENCE global. audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' + + # --- Browser Login (Flow 1: OIDC redirect for web console) --- userAuth: openId: + # OAuth App 1: Browser (confidential client, authorization_code grant) baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' + # Scopes requested during browser OIDC login. + # Terraform appends OIDC_APP_SCOPE here when set (for Entra ID). scopes: ["profile", "openid", "offline_access"] cookieSetting: sameSitePolicy: "LaxMode" From eacf7ff9fcf2f137b672fb2200792118fe9caed0 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 07:31:14 +1000 Subject: [PATCH 10/23] Remove Terraform references from OIDC auth config comments Values files should not assume specific deployment tooling. Replaced "Terraform-generated values" with "environment-specific values overlay" and "Terraform authn module output" with "values overlay". 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 11 ++++++----- .../values.gcp.selfhosted-intracluster.yaml | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index 09160558..226db1b0 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -172,7 +172,7 @@ global: OIDC_APP_AUDIENCE: "" # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. - # Set them in your environment-specific overlay (Terraform-generated values). + # Set them in your environment-specific values overlay. # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. @@ -370,9 +370,10 @@ flyte: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' # Metadata discovery endpoint (see OIDC_METADATA_URL global). metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' - # allowedAudience is set by Terraform (list type, can't use a single global). - # Default: ["https://{domain}"]. Override via OIDC_ALLOWED_AUDIENCE global - # or Terraform authn module output. + # allowedAudience: list of accepted JWT audiences for access token validation. + # Set in your environment-specific values overlay (list type, can't use a + # single global). Default: ["https://{domain}"]. + # Override via OIDC_ALLOWED_AUDIENCE or in your values overlay. # --- CLI/SDK PKCE Client (Flow 2: SDK/CLI authentication) --- # Returned by the GetPublicClientConfig RPC. The SDK reads this to @@ -397,7 +398,7 @@ flyte: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' # Scopes requested during browser OIDC login. - # Terraform appends OIDC_APP_SCOPE here when set (for Entra ID). 
+ # For Entra ID, append OIDC_APP_SCOPE here in your values overlay. scopes: ["profile", "openid", "offline_access"] cookieSetting: sameSitePolicy: "LaxMode" diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index a80df4a4..8ab4c067 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -180,7 +180,7 @@ global: OIDC_APP_AUDIENCE: "" # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. - # Set them in your environment-specific overlay (Terraform-generated values). + # Set them in your environment-specific values overlay. # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. @@ -400,9 +400,10 @@ flyte: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' # Metadata discovery endpoint (see OIDC_METADATA_URL global). metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' - # allowedAudience is set by Terraform (list type, can't use a single global). - # Default: ["https://{domain}"]. Override via OIDC_ALLOWED_AUDIENCE global - # or Terraform authn module output. + # allowedAudience: list of accepted JWT audiences for access token validation. + # Set in your environment-specific values overlay (list type, can't use a + # single global). Default: ["https://{domain}"]. + # Override via OIDC_ALLOWED_AUDIENCE or in your values overlay. # --- CLI/SDK PKCE Client (Flow 2: SDK/CLI authentication) --- # Returned by the GetPublicClientConfig RPC. The SDK reads this to @@ -427,7 +428,7 @@ flyte: baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' # Scopes requested during browser OIDC login. - # Terraform appends OIDC_APP_SCOPE here when set (for Entra ID). + # For Entra ID, append OIDC_APP_SCOPE here in your values overlay. 
scopes: ["profile", "openid", "offline_access"] cookieSetting: sameSitePolicy: "LaxMode" From df9806a1e7e795edf5d19ecc3473d498d7c22c89 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 15:46:07 +1000 Subject: [PATCH 11/23] Add OIDC_S2S_SCOPE global for service-to-service client_credentials Entra ID requires scope "api://{app}/.default" for client_credentials grants. The default "all" causes AADSTS1002012 invalid_scope. New global OIDC_S2S_SCOPE: used by internal service auth config. Default: "all" (Okta). Entra ID: "api://my-app/.default". Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 10 +++++++++- .../values.gcp.selfhosted-intracluster.yaml | 10 +++++++++- tests/generated/controlplane.custom-oidc.yaml | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index 226db1b0..10e90fb9 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -171,6 +171,13 @@ global: # Entra ID example: "api://my-app-name" OIDC_APP_AUDIENCE: "" + # OAuth2 scope for service-to-service authentication (client_credentials grant). + # Used by internal controlplane services (App 3) and dataplane operator (App 4). + # Okta: leave empty (defaults to "all", configured on the auth server). + # Entra ID example: "api://my-app-name/.default" + # Entra requires /.default suffix for client_credentials grants. + OIDC_S2S_SCOPE: "" + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. # Set them in your environment-specific values overlay. # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). 
@@ -604,7 +611,8 @@ services: clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' clientSecretLocation: "/etc/secrets/union/client_secret" tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' - scopes: ["all"] + scopes: + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' # ---------------------------------------------------------------------------- # Monitoring Configuration (AWS/EKS specific) diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index 8ab4c067..f0060a6e 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -179,6 +179,13 @@ global: # Entra ID example: "api://my-app-name" OIDC_APP_AUDIENCE: "" + # OAuth2 scope for service-to-service authentication (client_credentials grant). + # Used by internal controlplane services (App 3) and dataplane operator (App 4). + # Okta: leave empty (defaults to "all", configured on the auth server). + # Entra ID example: "api://my-app-name/.default" + # Entra requires /.default suffix for client_credentials grants. + OIDC_S2S_SCOPE: "" + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. # Set them in your environment-specific values overlay. # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). 
@@ -618,7 +625,8 @@ services: clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' clientSecretLocation: "/etc/secrets/union/client_secret" tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' - scopes: ["all"] + scopes: + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' # ---------------------------------------------------------------------------- # SECTION 9: ScyllaDB Configuration diff --git a/tests/generated/controlplane.custom-oidc.yaml b/tests/generated/controlplane.custom-oidc.yaml index bb61fd1f..0b872722 100644 --- a/tests/generated/controlplane.custom-oidc.yaml +++ b/tests/generated/controlplane.custom-oidc.yaml @@ -981,7 +981,7 @@ data: endpoint: '' insecure: true scopes: - - all + - 'all' tokenUrl: 'https://idp.example.com/oauth2/v2.0/token' apps: enrichIdentities: false From b45a4fb35a0e7f4306b7904be460b385b009ef7b Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 15:58:28 +1000 Subject: [PATCH 12/23] Regenerate snapshots after rebase onto main Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/generated/controlplane.custom-oidc.yaml | 178 +++++++++--------- tests/generated/controlplane.userclouds.yaml | 36 ++-- 2 files changed, 108 insertions(+), 106 deletions(-) diff --git a/tests/generated/controlplane.custom-oidc.yaml b/tests/generated/controlplane.custom-oidc.yaml index 0b872722..0ef0c61f 100644 --- a/tests/generated/controlplane.custom-oidc.yaml +++ b/tests/generated/controlplane.custom-oidc.yaml @@ -37,10 +37,10 @@ kind: PodDisruptionBudget metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: minAvailable: "33%" @@ -231,7 +231,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.5 + 
helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/managed-by: Helm annotations: eks.amazonaws.com/role-arn: '' @@ -244,10 +244,10 @@ kind: ServiceAccount metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/serviceaccount.yaml @@ -257,10 +257,10 @@ kind: ServiceAccount metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/serviceaccount.yaml @@ -269,10 +269,10 @@ kind: ServiceAccount metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/serviceaccount.yaml @@ -281,10 +281,10 @@ kind: ServiceAccount metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/serviceaccount.yaml @@ -293,10 +293,10 @@ kind: ServiceAccount metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" 
+ app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/serviceaccount.yaml @@ -305,10 +305,10 @@ kind: ServiceAccount metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/serviceaccount.yaml @@ -317,10 +317,10 @@ kind: ServiceAccount metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/serviceaccount.yaml @@ -329,10 +329,10 @@ kind: ServiceAccount metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/union-serviceaccount.yaml @@ -342,10 +342,10 @@ metadata: name: union namespace: union labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/charts/flyte/templates/admin/secret.yaml @@ -605,7 +605,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/managed-by: Helm data: db.yaml: | @@ -676,10 
+676,10 @@ kind: ConfigMap metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -769,10 +769,10 @@ kind: ConfigMap metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -854,10 +854,10 @@ kind: ConfigMap metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -927,10 +927,10 @@ kind: ConfigMap metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -989,6 +989,8 @@ data: llm: enabled: false task: + clusterCacheConfig: + ttl: 10m enabled: true enrichIdentities: false logger: @@ -1032,10 +1034,10 @@ kind: ConfigMap metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -1111,10 +1113,10 @@ kind: ConfigMap metadata: name: run-scheduler labels: - 
helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -1190,10 +1192,10 @@ kind: ConfigMap metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -5876,7 +5878,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 #app.kubernetes.io/managed-by: Helm rules: - apiGroups: @@ -5930,7 +5932,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 #app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io @@ -6090,7 +6092,7 @@ metadata: platform.union.ai/prometheus-group: "union-services" app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6117,10 +6119,10 @@ metadata: name: unionconsole labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6145,10 +6147,10 @@ metadata: name: authorizer labels: platform.union.ai/prometheus-group: "union-services" - 
helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6184,10 +6186,10 @@ metadata: name: cluster labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6223,10 +6225,10 @@ metadata: name: dataproxy labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6258,10 +6260,10 @@ metadata: name: executions labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6293,10 +6295,10 @@ metadata: name: queue labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6328,10 +6330,10 @@ metadata: name: run-scheduler labels: platform.union.ai/prometheus-group: "union-services" - 
helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6363,10 +6365,10 @@ metadata: name: usage labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6905,7 +6907,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/managed-by: Helm spec: replicas: 1 @@ -6916,7 +6918,7 @@ spec: template: metadata: annotations: - configChecksum: "9c88958e2c6c93925c335fa455ed035dc6c32e92bead61445ca6463e40c19cc" + configChecksum: "197f4097faabcce1c83bbd79953460800854d6b92837333f3dd60c6c1bfa14a" linkerd.io/inject: disabled prometheus.io/path: /metrics prometheus.io/port: "10254" @@ -6925,7 +6927,7 @@ spec: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/managed-by: Helm spec: securityContext: @@ -7017,10 +7019,10 @@ kind: Deployment metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: strategy: @@ -7060,7 +7062,7 @@ spec: capabilities: drop: - ALL - image: "registry.unionai.cloud/controlplane/unionconsole:2026.4.5" + image: 
"registry.unionai.cloud/controlplane/unionconsole:2026.4.7" imagePullPolicy: IfNotPresent ports: - name: http @@ -7087,10 +7089,10 @@ kind: Deployment metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: selector: @@ -7128,7 +7130,7 @@ spec: name: authorizer containers: - name: authorizer - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - authorizer @@ -7202,10 +7204,10 @@ kind: Deployment metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: selector: @@ -7243,7 +7245,7 @@ spec: name: cluster initContainers: - name: cluster-migrate - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - cloudcluster @@ -7259,7 +7261,7 @@ spec: mountPath: /etc/config/ containers: - name: cluster - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - cloudcluster @@ -7333,10 +7335,10 @@ kind: Deployment metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: 
"2026.4.7" app.kubernetes.io/managed-by: Helm spec: selector: @@ -7374,7 +7376,7 @@ spec: name: dataproxy containers: - name: dataproxy - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - dataproxy @@ -7445,10 +7447,10 @@ kind: Deployment metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: selector: @@ -7486,7 +7488,7 @@ spec: name: executions initContainers: - name: executions-migrate - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - cloudpropeller @@ -7502,7 +7504,7 @@ spec: mountPath: /etc/config/ containers: - name: executions - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - cloudpropeller @@ -7573,10 +7575,10 @@ kind: Deployment metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: replicas: 1 @@ -7615,7 +7617,7 @@ spec: name: queue initContainers: - name: queue-migrate - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - queue @@ -7631,7 +7633,7 @@ spec: mountPath: /etc/config/ 
containers: - name: queue - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - queue @@ -7702,10 +7704,10 @@ kind: Deployment metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: selector: @@ -7743,7 +7745,7 @@ spec: name: run-scheduler initContainers: - name: run-scheduler-migrate - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - cloudpropeller @@ -7759,7 +7761,7 @@ spec: mountPath: /etc/config/ containers: - name: run-scheduler - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - cloudpropeller @@ -7831,10 +7833,10 @@ kind: Deployment metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: selector: @@ -7872,7 +7874,7 @@ spec: name: usage containers: - name: usage - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent args: - usage @@ -7979,10 +7981,10 @@ kind: HorizontalPodAutoscaler metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: 
controlplane-2026.4.7 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: scaleTargetRef: diff --git a/tests/generated/controlplane.userclouds.yaml b/tests/generated/controlplane.userclouds.yaml index 60dad826..473cbef1 100644 --- a/tests/generated/controlplane.userclouds.yaml +++ b/tests/generated/controlplane.userclouds.yaml @@ -37,10 +37,10 @@ kind: PodDisruptionBudget metadata: name: release-name-union-authz labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: minAvailable: 2 @@ -241,10 +241,10 @@ kind: ServiceAccount metadata: name: release-name-union-authz labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/cacheservice/rbac.yaml @@ -622,10 +622,10 @@ kind: ConfigMap metadata: name: release-name-union-authz-config labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm data: config.yaml: | @@ -5911,10 +5911,10 @@ kind: Role metadata: name: release-name-union-authz-secrets-manager labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: 
"2026.4.7" app.kubernetes.io/managed-by: Helm rules: - apiGroups: [""] @@ -6000,10 +6000,10 @@ kind: RoleBinding metadata: name: release-name-union-authz-secrets-manager labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io @@ -6153,10 +6153,10 @@ kind: Service metadata: name: release-name-union-authz labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -6977,10 +6977,10 @@ kind: Deployment metadata: name: release-name-union-authz labels: - helm.sh/chart: controlplane-2026.4.5 + helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: strategy: @@ -6995,7 +6995,7 @@ spec: template: metadata: annotations: - checksum/config: 143023ded44f2db18ddf79507adcbb11c31eb4da967c39179d54bd01bdb07f5c + checksum/config: 8bda5502d8cb82e6d35a0d7495c605eba4ea4137a8294c0149d7b60dafb1d458 linkerd.io/inject: disabled prometheus.io/path: /metrics prometheus.io/port: "10254" @@ -7019,7 +7019,7 @@ spec: drop: - ALL readOnlyRootFilesystem: true - image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.5 + image: 643379628101.dkr.ecr.us-east-1.amazonaws.com/union-cp/services:2026.4.7 imagePullPolicy: IfNotPresent command: - userclouds-lite @@ -8143,10 +8143,10 @@ kind: HorizontalPodAutoscaler metadata: name: release-name-union-authz labels: - helm.sh/chart: controlplane-2026.4.5 + 
helm.sh/chart: controlplane-2026.4.7 app.kubernetes.io/name: union-authz app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/version: "2026.4.7" app.kubernetes.io/managed-by: Helm spec: scaleTargetRef: From e9889b49c3befc5d5ec7af39f918ad758c97c2f0 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 16:21:19 +1000 Subject: [PATCH 13/23] Add OIDC_S2S_SCOPE to dataplane auth config Dataplane services (operator, proxy, executor) use client_credentials to authenticate with the control plane. Entra ID requires /.default scope for this grant type. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/charts/dataplane/values.aws.selfhosted-intracluster.yaml b/charts/dataplane/values.aws.selfhosted-intracluster.yaml index b2a6654d..58c62dad 100644 --- a/charts/dataplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/dataplane/values.aws.selfhosted-intracluster.yaml @@ -109,9 +109,14 @@ global: # Required when the control plane has OIDC enabled. # Supports any OAuth2/OIDC provider (Okta, Azure AD, Auth0, Keycloak, etc.) # - # Service-to-service OAuth client ID (client_credentials flow) + # Service-to-service OAuth client ID (client_credentials flow). + # Uses: OAuth App 4 (Operator). # Example: "0oa3xyz4abc5def6g7h8" AUTH_CLIENT_ID: "" + # OAuth2 scope for service-to-service client_credentials grant. + # Okta: leave empty (defaults to "all"). + # Entra ID: "api://my-app-name/.default" + OIDC_S2S_SCOPE: "" # ---------------------------------------------------------------------------- # SECTION 2: Core Identity Configuration (REQUIRED) @@ -181,6 +186,9 @@ clusterresourcesync: # --- Service-to-service OAuth2 --- # ClusterResourceSync acquires OAuth2 tokens via client_credentials # flow and sends them on outgoing calls to the control plane. 
+ # OAuth2 auth for dataplane → controlplane service-to-service calls. + # Uses: OAuth App 4 (Operator — confidential client, client_credentials grant). + # Flow: Service-to-service (Flow 3). auth: enable: true type: "ClientSecret" @@ -188,6 +196,8 @@ clusterresourcesync: clientSecretLocation: "/etc/union/secret/client_secret" authorizationMetadataKey: "flyte-authorization" tokenRefreshWindow: "5m" + scopes: + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' # ---------------------------------------------------------------------------- # Core Service Configuration From b03e470fe67d957dcb2afa4b156812ad6e472583 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 21:14:31 +1000 Subject: [PATCH 14/23] Add server-alias for intra-cluster auth on selfhosted ingress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In selfhosted-intracluster mode, DP services connect to the CP nginx controller via internal K8s DNS. Without server-alias, the :authority header doesn't match the ingress host and auth subrequests are bypassed. This ensures all DP→CP traffic goes through nginx auth regardless of whether it arrives via internal DNS or the external domain. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 12 ++++++++++++ .../values.gcp.selfhosted-intracluster.yaml | 9 ++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index 10e90fb9..c490b5fd 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -478,10 +478,22 @@ ingress: - "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" secretName: "{{ .Values.global.TLS_SECRET_NAME }}" + # --- Ingress Annotations (shared across all ingress objects) --- + annotations: + # Allow the nginx controller's internal DNS to match ingress rules so that + # intra-cluster traffic (DP → CP via nginx service DNS) is routed through + # the same auth subrequest as external traffic. Without this, the :authority + # header won't match the ingress host and auth is bypassed. + nginx.ingress.kubernetes.io/server-alias: "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" + # --- Protected Ingress Auth Annotations --- # These configure nginx to validate requests via flyteadmin's /me endpoint # and redirect unauthenticated users to /login for the OIDC flow. # Active when OIDC authentication is enabled (server.security.useAuth: true). + # + # All protected endpoints use "https://$host/me" so the auth subrequest goes + # through nginx itself. This ensures verifyClaims runs on the access token, + # which resolves identitytype for all callers (browser, CLI, service-to-service). 
protectedIngressAnnotations: nginx.ingress.kubernetes.io/auth-url: "https://$host/me" nginx.ingress.kubernetes.io/auth-signin: "https://$host/login?redirect_url=$escaped_request_uri" diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index f0060a6e..2e7274f9 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -508,7 +508,14 @@ ingress: - "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" secretName: "{{ .Values.global.TLS_SECRET_NAME }}" - # Protected ingress auth annotations are now defined in the base values.yaml. + # --- Ingress Annotations (shared across all ingress objects) --- + annotations: + # Allow the nginx controller's internal DNS to match ingress rules so that + # intra-cluster traffic (DP → CP via nginx service DNS) is routed through + # the same auth subrequest as external traffic. + nginx.ingress.kubernetes.io/server-alias: "{{ .Values.global.CONTROLPLANE_INTRA_CLUSTER_HOST }}" + + # Protected ingress auth annotations are defined in the base values.yaml. # Override here only if you need to customize auth behavior for this deployment mode. # ---------------------------------------------------------------------------- From cc9717f32a7d44f3dadffcca63c314965021d9b6 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 21:44:05 +1000 Subject: [PATCH 15/23] Add gRPC identity header forwarding via configuration-snippet For gRPC backends (backend-protocol: GRPC), nginx uses grpc_pass instead of proxy_pass. The auth-response-headers annotation only sets proxy headers, not gRPC headers. This configuration-snippet bridges identity headers (X-User-Subject, X-User-Claim-Identitytype, etc.) from the auth subrequest response into the upstream gRPC request. This has been in BYOC since Oct 2024 (cdf8f5c6f6) but was never ported to the selfhosted/selfmanaged controlplane chart. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/controlplane/values.yaml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index 65e818ae..ff1d9bac 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -253,6 +253,31 @@ ingress: nginx.ingress.kubernetes.io/auth-url: "http://flyteadmin.{{ template \"flyte.namespace\" . }}.svc.cluster.local/me" nginx.ingress.kubernetes.io/auth-response-headers: "Set-Cookie,X-User-Subject,X-User-Claim-Identitytype,X-User-Claim-Preferred-Username,X-User-Token" nginx.ingress.kubernetes.io/auth-cache-key: "$http_authorization$http_flyte_authorization$http_cookie" + # For gRPC backends (backend-protocol: GRPC), nginx uses grpc_pass instead + # of proxy_pass. The auth-response-headers annotation only sets proxy headers, + # not gRPC headers. This configuration-snippet bridges identity headers from + # the auth subrequest response into the upstream gRPC request so backend + # services receive the caller's identity. 
+ nginx.ingress.kubernetes.io/configuration-snippet: | + auth_request_set $user_id $upstream_http_x_user_subject; + proxy_set_header X-User-Subject $user_id; + grpc_set_header X-User-Subject $user_id; + + auth_request_set $user_identitytype $upstream_http_x_user_claim_identitytype; + proxy_set_header X-User-Claim-Identitytype $user_identitytype; + grpc_set_header X-User-Claim-Identitytype $user_identitytype; + + auth_request_set $user_handle $upstream_http_x_user_claim_userhandle; + proxy_set_header X-User-Claim-userhandle $user_handle; + grpc_set_header X-User-Claim-userhandle $user_handle; + + auth_request_set $groups $upstream_http_x_user_claim_groups; + proxy_set_header X-User-Claim-groups $groups; + grpc_set_header X-User-Claim-groups $groups; + + more_set_headers "x-request-id: $request_id"; + proxy_set_header x-request-id $request_id; + grpc_set_header x-request-id $request_id; envoyGateway: # GatewayClass name for Envoy Gateway. Used when INGRESS_PROVIDER is "envoy" or "both". From 62b64069a21562750f0685046fe77742e9947dff Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 21:58:57 +1000 Subject: [PATCH 16/23] Add organizations service to controlplane chart The v2 SDK CreateRun path calls SettingsService/GetSettings (served by the organizations service) to resolve task resource defaults. This service exists in BYOC but was missing from selfhosted, causing "no children to pick from" when the executions service tried to reach it via internalConnectionConfig. Adds a minimal organizations service (cloudorganizations binary, shared DB, no cloud-specific features like externalIDProvisioning or Redis). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/controlplane/values.yaml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index ff1d9bac..7536be97 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -566,6 +566,37 @@ services: secureTunnelTenantURLPattern: http://ingress-nginx-internal.ingress-nginx.svc.cluster.local:80 # http://ingress-nginx-internal.ingress-nginx.svc.cluster.local clusterSelector: type: local + organizations: + fullnameOverride: "organizations" + sharedService: + connectPort: 8081 + initContainers: + - name: migrate + args: + - cloudorganizations + - migrate + - --config + - "/etc/config/*.yaml" + args: + - cloudorganizations + - serve + - --config + - /etc/config/*.yaml + configMap: + sharedService: + metrics: + scope: "organizations:" + db: + dbname: '{{ .Values.global.DB_NAME }}' + host: '{{ .Values.global.DB_HOST }}' + username: '{{ .Values.global.DB_USER }}' + passwordPath: /etc/db/pass.txt + port: 5432 + connectionPool: + maxIdleConnections: 20 + maxOpenConnections: 20 + maxConnectionLifetime: 1m + executions: fullnameOverride: "executions" initContainers: From 2a1c4604ee0872acd42b8603fa566ee52cbab01b Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 22:02:49 +1000 Subject: [PATCH 17/23] Fix organizations service connectPort in configmap The connectPort must be in both the top-level sharedService (for the container port in the Deployment) and configMap.sharedService (for the application config). Without it in the configmap, the connect server defaults to port 8080 which conflicts with the gRPC server. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/controlplane/values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index 7536be97..b0c14264 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -584,6 +584,7 @@ services: - /etc/config/*.yaml configMap: sharedService: + connectPort: 8081 metrics: scope: "organizations:" db: From 324ea4f77898d6aabd15e8ce2bf4669fb47079b0 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Fri, 17 Apr 2026 22:12:38 +1000 Subject: [PATCH 18/23] Use OIDC_S2S_SCOPE global for CP service-to-service auth scopes Entra ID requires api://{app}/.default for client_credentials grants. The hardcoded "all" scope works for Okta but fails for Entra with AADSTS1002012. Use the OIDC_S2S_SCOPE global (already set by terraform) with fallback to "all" for backwards compatibility. Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/controlplane/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index b0c14264..1df42022 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -350,7 +350,7 @@ configMap: tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' authorizationMetadataKey: flyte-authorization scopes: - - all + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' services: artifacts: From 8e9167371e2e4e2fa9ae52f7e1247febdce964f0 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sat, 18 Apr 2026 08:50:56 +1000 Subject: [PATCH 19/23] Move executions adminClient.connection to base values.yaml The adminClient config was duplicated in the AWS and GCP selfhosted overlays. Since every field uses globals (FLYTEADMIN_ENDPOINT, INTERNAL_CLIENT_ID, AUTH_TOKEN_URL, OIDC_S2S_SCOPE), it belongs in the base chart values so terraform doesn't need deep-merge overrides just to set the S2S scope. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 18 ------------- .../values.gcp.selfhosted-intracluster.yaml | 18 ------------- charts/controlplane/values.yaml | 25 ++++++++++--------- 3 files changed, 13 insertions(+), 48 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index c490b5fd..6a2fd5fa 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -608,24 +608,6 @@ services: # Connect to dataplane ingress controller secureTunnelTenantURLPattern: '{{ .Values.global.DATAPLANE_ENDPOINT }}' - # Executions service configuration - executions: - configMap: - executions: - app: - adminClient: - connection: - # Flyteadmin endpoint for executions service - endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}' - insecure: true - # --- Auth fields (active when OIDC is enabled) --- - authorizationHeader: "flyte-authorization" - clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' - clientSecretLocation: "/etc/secrets/union/client_secret" - tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' - scopes: - - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' - # ---------------------------------------------------------------------------- # Monitoring Configuration (AWS/EKS specific) # ---------------------------------------------------------------------------- diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index 2e7274f9..d2d19558 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -617,24 +617,6 @@ services: # Connect to dataplane ingress controller secureTunnelTenantURLPattern: '{{ .Values.global.DATAPLANE_ENDPOINT }}' - # Executions service configuration - executions: - 
configMap: - executions: - app: - adminClient: - connection: - # Flyteadmin endpoint for executions service - endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}' - insecure: true - # --- Auth fields (active when OIDC is enabled) --- - authorizationHeader: "flyte-authorization" - clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' - clientSecretLocation: "/etc/secrets/union/client_secret" - tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' - scopes: - - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' - # ---------------------------------------------------------------------------- # SECTION 9: ScyllaDB Configuration # ---------------------------------------------------------------------------- diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index 1df42022..ed21e1c0 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -636,18 +636,19 @@ services: eventsProxy: recorderType: RunService executions: - - # app: - # adminClient: - # connection: - # -- Override rootTenantURLPattern for adminClient to point to control plane service. - # endpoint: "" - - # -- Insecure should be true only for local testing with self-signed certs. - # insecure: true|false - - # -- Skip TLS verification for self-signed certs. Should be true only for local testing. - # insecureSkipVerify: true|false + app: + adminClient: + connection: + # TODO(FAB-195): Replace FLYTEADMIN_ENDPOINT with CONTROLPLANE_INTRA_CLUSTER_HOST + # so all S2S traffic routes through ingress (auth subrequests, TLS, etc.). + endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}' + insecure: true + authorizationHeader: "flyte-authorization" + clientId: '{{ .Values.global.INTERNAL_CLIENT_ID }}' + clientSecretLocation: "/etc/secrets/union/client_secret" + tokenUrl: '{{ .Values.global.AUTH_TOKEN_URL }}' + scopes: + - '{{ default "all" .Values.global.OIDC_S2S_SCOPE }}' apps: # Enrich "Owned By" display with name/email from a remote identity service. 
From f375d556367c647e71acc5a7c1667032a8e97741 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sat, 18 Apr 2026 12:33:05 +1000 Subject: [PATCH 20/23] Add OIDC_BROWSER_SCOPE global for Entra browser auth Entra rejects /.default for same-app authorization_code flows (AADSTS90009). Browser login needs a specific delegated scope (/all) while task pods need /.default for client_credentials. Add OIDC_BROWSER_SCOPE to AWS and GCP overlay globals. The actual userAuth.openId.scopes injection is still done by terraform until the adminServer.auth block is fully extracted to base values.yaml. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 20 +++++++------------ .../values.gcp.selfhosted-intracluster.yaml | 20 +++++++------------ 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index 6a2fd5fa..6adb3b37 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -171,6 +171,12 @@ global: # Entra ID example: "api://my-app-name" OIDC_APP_AUDIENCE: "" + # OAuth2 scope for browser authorization_code flow. + # Separate from OIDC_APP_SCOPE because Entra rejects /.default for same-app + # authorization_code flows (AADSTS90009). For Okta, leave empty. + # Entra ID example: "api://my-app-name/all" + OIDC_BROWSER_SCOPE: "" + # OAuth2 scope for service-to-service authentication (client_credentials grant). # Used by internal controlplane services (App 3) and dataplane operator (App 4). # Okta: leave empty (defaults to "all", configured on the auth server). @@ -398,19 +404,7 @@ flyte: # See OIDC_APP_AUDIENCE global. 
audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' - # --- Browser Login (Flow 1: OIDC redirect for web console) --- - userAuth: - openId: - # OAuth App 1: Browser (confidential client, authorization_code grant) - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' - # Scopes requested during browser OIDC login. - # For Entra ID, append OIDC_APP_SCOPE here in your values overlay. - scopes: ["profile", "openid", "offline_access"] - cookieSetting: - sameSitePolicy: "LaxMode" - domain: "" - idpQueryParameter: "idp" + # userAuth.openId is now in the base values.yaml with OIDC_BROWSER_SCOPE global. # Enable scheduler auth secret mount so flyte-secret-auth is mounted at /etc/secrets/. # Set clientSecret: null so the subchart does NOT create the secret — it must be diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index d2d19558..0e2c4197 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -179,6 +179,12 @@ global: # Entra ID example: "api://my-app-name" OIDC_APP_AUDIENCE: "" + # OAuth2 scope for browser authorization_code flow. + # Separate from OIDC_APP_SCOPE because Entra rejects /.default for same-app + # authorization_code flows (AADSTS90009). For Okta, leave empty. + # Entra ID example: "api://my-app-name/all" + OIDC_BROWSER_SCOPE: "" + # OAuth2 scope for service-to-service authentication (client_credentials grant). # Used by internal controlplane services (App 3) and dataplane operator (App 4). # Okta: leave empty (defaults to "all", configured on the auth server). @@ -428,19 +434,7 @@ flyte: # See OIDC_APP_AUDIENCE global. 
audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' - # --- Browser Login (Flow 1: OIDC redirect for web console) --- - userAuth: - openId: - # OAuth App 1: Browser (confidential client, authorization_code grant) - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' - # Scopes requested during browser OIDC login. - # For Entra ID, append OIDC_APP_SCOPE here in your values overlay. - scopes: ["profile", "openid", "offline_access"] - cookieSetting: - sameSitePolicy: "LaxMode" - domain: "" - idpQueryParameter: "idp" + # userAuth.openId is now in the base values.yaml with OIDC_BROWSER_SCOPE global. # Enable scheduler auth secret mount so flyte-secret-auth is mounted at /etc/secrets/. # Set clientSecret: "placeholder" so the subchart renders the secret — it must be From 94d845323c481420c6650a2ed38d783af5cac256 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sat, 18 Apr 2026 13:17:09 +1000 Subject: [PATCH 21/23] Consolidate auth config into base values.yaml with globals Move all OIDC/OAuth2 globals and the full adminServer.auth block from cloud-specific overlays to the base values.yaml. Every auth field is now either a static default or a global variable, eliminating the need for terraform deep merge overrides. New globals: OIDC_BASE_URL, OIDC_CLIENT_ID, CLI_CLIENT_ID, OIDC_METADATA_URL, OIDC_ALLOWED_AUDIENCE, OIDC_APP_SCOPE, OIDC_APP_AUDIENCE, OIDC_BROWSER_SCOPE, OIDC_S2S_SCOPE, OIDC_SUBJECT_CLAIM_NAMES, OIDC_IDENTITY_TYPE_CLAIMS, INTERNAL_SUBJECT_ID. Also moves configMap.union.connection.trustedIdentityClaims to base using INTERNAL_SUBJECT_ID (defaults to INTERNAL_CLIENT_ID). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../values.aws.selfhosted-intracluster.yaml | 154 +---------------- .../values.gcp.selfhosted-intracluster.yaml | 156 +----------------- charts/controlplane/values.yaml | 120 +++++++++++--- 3 files changed, 111 insertions(+), 319 deletions(-) diff --git a/charts/controlplane/values.aws.selfhosted-intracluster.yaml b/charts/controlplane/values.aws.selfhosted-intracluster.yaml index 6adb3b37..c8fb66f5 100644 --- a/charts/controlplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.aws.selfhosted-intracluster.yaml @@ -108,83 +108,9 @@ global: DATAPLANE_ENDPOINT: "" # --- Authentication Configuration --- - # Configure your OAuth2/OIDC identity provider below. - # Supports any OIDC-compliant provider (Okta, Azure AD / Entra ID, Keycloak, etc.) - # - # Required for all providers: - # OIDC_BASE_URL, OIDC_CLIENT_ID, CLI_CLIENT_ID - # INTERNAL_CLIENT_ID, AUTH_TOKEN_URL (in base values.yaml) - # - # Provider-specific (may be required depending on your IdP): - # OIDC_METADATA_URL, OIDC_ALLOWED_AUDIENCE, OIDC_APP_SCOPE, OIDC_APP_AUDIENCE - # - # See also: flyte.configmap.adminServer.auth.appAuth.identityTypeClaimsForApps - # for IdP-specific identity type claim mapping (set in values overlay, not as a global). - - # OIDC issuer URL (authorization server base URL). - # This is the base URL for token validation, JWKS discovery, and user info. - # Okta example: "https://dev-123456.okta.com/oauth2/default" - # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" - OIDC_BASE_URL: "" - - # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). - # Flyteadmin resolves this against OIDC_BASE_URL to fetch JWKS and token endpoints. 
- # Most providers support one or both of these endpoints: - # ".well-known/oauth-authorization-server" — RFC 8414 (Okta) - # ".well-known/openid-configuration" — OpenID Connect Discovery (Entra ID, Keycloak) - # Default: ".well-known/oauth-authorization-server" - OIDC_METADATA_URL: ".well-known/oauth-authorization-server" - - # OAuth2 client ID for the browser/web UI login app (confidential client, - # authorization_code grant). This is the "flyteadmin" or "browser" app - # in your IdP's OAuth application configuration. - # Okta example: "0oa1abc2def3ghi4j5k6" - # Entra ID example: "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12" - OIDC_CLIENT_ID: "" - - # OAuth2 client ID for the CLI/SDK app (public client, PKCE flow). - # Used by flytectl, uctl, and the Flyte SDK for interactive authentication. - # Okta example: "0oa7mno8pqr9stu0v1w2" - # Entra ID example: "3df10225-18a5-4636-b1ef-582e5a8ea21c" - CLI_CLIENT_ID: "" - - # Allowed JWT audiences for access token validation. - # Flyteadmin checks the access token "aud" claim against this list. - # When empty, defaults to ["https://{UNION_HOST}"] (the deployment domain). - # Override for IdPs that use different audience formats in access tokens. - # Okta: typically uses the auth server issuer URL (leave empty to use default). - # Entra ID example: ["api://my-app-name", "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12"] - OIDC_ALLOWED_AUDIENCE: [] - - # OAuth2 resource scope for the flyteadmin app. - # When set, this scope is requested during browser login and CLI PKCE flows - # so the IdP returns access tokens scoped to your app (correct audience). - # Without this, some IdPs (notably Entra ID) return generic access tokens - # with the wrong audience, causing access token validation to fail. - # Okta: leave empty (Okta scopes are configured on the auth server). - # Entra ID example: "api://my-app-name/all" - OIDC_APP_SCOPE: "" - - # Audience identifier for the CLI/SDK PKCE flow. 
- # Some IdPs require an explicit audience parameter in the authorization request. - # Okta: leave empty (derived from auth server). - # Entra ID example: "api://my-app-name" - OIDC_APP_AUDIENCE: "" - - # OAuth2 scope for browser authorization_code flow. - # Separate from OIDC_APP_SCOPE because Entra rejects /.default for same-app - # authorization_code flows (AADSTS90009). For Okta, leave empty. - # Entra ID example: "api://my-app-name/all" - OIDC_BROWSER_SCOPE: "" - - # OAuth2 scope for service-to-service authentication (client_credentials grant). - # Used by internal controlplane services (App 3) and dataplane operator (App 4). - # Okta: leave empty (defaults to "all", configured on the auth server). - # Entra ID example: "api://my-app-name/.default" - # Entra requires /.default suffix for client_credentials grants. - OIDC_S2S_SCOPE: "" - - # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. + # All OIDC/OAuth2 globals are defined in the base values.yaml with documentation. + # Set them in your environment-specific values overlay generated by Terraform. + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are also in the base values.yaml. # Set them in your environment-specific values overlay. # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. @@ -333,78 +259,8 @@ flyte: # Subject to removal in the future singleTenantOrgID: '{{ .Values.global.UNION_ORG }}' - # --- OIDC Authentication --- - # Flyteadmin acts as both the OAuth2 resource server (validates access tokens) - # and the OIDC relying party (browser login flow). Configure the globals in - # Section 1 above, then enable auth: - # - # server: - # security: - # useAuth: true - # - # The sections below map to different authentication flows: - # - # auth.appAuth.externalAuthServer - # Resource server config — validates access tokens from ALL flows. 
- # Uses OIDC_BASE_URL for JWKS discovery and token validation. - # Uses: OAuth Apps 1-5 (all tokens are validated here). - # Flows: Browser login, CLI/SDK PKCE, service-to-service. - # - # auth.appAuth.thirdPartyConfig.flyteClient - # CLI/SDK PKCE client config — returned by GetPublicClientConfig RPC. - # The SDK/CLI uses this to initiate the PKCE authorization flow. - # Uses: OAuth App 2 (CLI — public client). - # Flow: CLI/SDK PKCE (Flow 2). - # - # auth.userAuth.openId - # Browser login config — OIDC redirect flow for web console. - # Uses: OAuth App 1 (Browser — confidential client). - # Flow: Browser login (Flow 1). - # - # Service-to-service auth (OAuth Apps 3-5) is configured separately: - # - INTERNAL_CLIENT_ID + AUTH_TOKEN_URL in globals (App 3) - # - Operator and EAGER credentials in dataplane values (Apps 4, 5) - auth: - # Custom authorization header name. All services use this instead of - # the standard "authorization" header to avoid conflicts with service - # meshes (e.g. Linkerd, Istio) that intercept the default header. - httpAuthorizationHeader: "flyte-authorization" - grpcAuthorizationHeader: "flyte-authorization" - # URIs that flyteadmin accepts as valid audiences in its own tokens. - authorizedUris: - - "http://flyteadmin:80" - - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' - - # --- Resource Server (validates access tokens from all flows) --- - appAuth: - authServerType: "External" - externalAuthServer: - # OIDC issuer for JWKS discovery and token validation. - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - # Metadata discovery endpoint (see OIDC_METADATA_URL global). - metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' - # allowedAudience: list of accepted JWT audiences for access token validation. - # Set in your environment-specific values overlay (list type, can't use a - # single global). Default: ["https://{domain}"]. - # Override via OIDC_ALLOWED_AUDIENCE or in your values overlay. 
- - # --- CLI/SDK PKCE Client (Flow 2: SDK/CLI authentication) --- - # Returned by the GetPublicClientConfig RPC. The SDK reads this to - # know which client ID, scopes, and audience to use for PKCE auth. - thirdPartyConfig: - flyteClient: - # OAuth App 2: CLI (public client, PKCE flow) - clientId: '{{ .Values.global.CLI_CLIENT_ID }}' - redirectUri: "http://localhost:53593/callback" - # Resource scope — determines the audience of the access token. - # See OIDC_APP_SCOPE global. - scopes: - - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' - # Audience parameter for the authorization request. - # See OIDC_APP_AUDIENCE global. - audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' - - # userAuth.openId is now in the base values.yaml with OIDC_BROWSER_SCOPE global. + # adminServer.auth is now fully configured in the base values.yaml + # using globals. No overlay-specific auth config needed. # Enable scheduler auth secret mount so flyte-secret-auth is mounted at /etc/secrets/. # Set clientSecret: null so the subchart does NOT create the secret — it must be diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml index 0e2c4197..4dba7891 100644 --- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml +++ b/charts/controlplane/values.gcp.selfhosted-intracluster.yaml @@ -116,85 +116,9 @@ global: IMAGE_REPOSITORY_PREFIX: "registry.unionai.cloud/controlplane" # --- Authentication Configuration --- - # Configure your OAuth2/OIDC identity provider below. - # Supports any OIDC-compliant provider (Okta, Azure AD / Entra ID, Keycloak, etc.) 
- # - # Required for all providers: - # OIDC_BASE_URL, OIDC_CLIENT_ID, CLI_CLIENT_ID - # INTERNAL_CLIENT_ID, AUTH_TOKEN_URL (in base values.yaml) - # - # Provider-specific (may be required depending on your IdP): - # OIDC_METADATA_URL, OIDC_ALLOWED_AUDIENCE, OIDC_APP_SCOPE, OIDC_APP_AUDIENCE - # - # See also: flyte.configmap.adminServer.auth.appAuth.identityTypeClaimsForApps - # for IdP-specific identity type claim mapping (set in values overlay, not as a global). - - # OIDC issuer URL (authorization server base URL). - # This is the base URL for token validation, JWKS discovery, and user info. - # Okta example: "https://dev-123456.okta.com/oauth2/default" - # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" - OIDC_BASE_URL: "" - - # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). - # Flyteadmin resolves this against OIDC_BASE_URL to fetch JWKS and token endpoints. - # Most providers support one or both of these endpoints: - # ".well-known/oauth-authorization-server" — RFC 8414 (Okta) - # ".well-known/openid-configuration" — OpenID Connect Discovery (Entra ID, Keycloak) - # Default: ".well-known/oauth-authorization-server" - OIDC_METADATA_URL: ".well-known/oauth-authorization-server" - - # OAuth2 client ID for the browser/web UI login app (confidential client, - # authorization_code grant). This is the "flyteadmin" or "browser" app - # in your IdP's OAuth application configuration. - # Okta example: "0oa1abc2def3ghi4j5k6" - # Entra ID example: "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12" - OIDC_CLIENT_ID: "" - - # OAuth2 client ID for the CLI/SDK app (public client, PKCE flow). - # Used by flytectl, uctl, and the Flyte SDK for interactive authentication. - # Okta example: "0oa7mno8pqr9stu0v1w2" - # Entra ID example: "3df10225-18a5-4636-b1ef-582e5a8ea21c" - CLI_CLIENT_ID: "" - - # Allowed JWT audiences for access token validation. - # Flyteadmin checks the access token "aud" claim against this list. 
- # When empty, defaults to ["https://{UNION_HOST}"] (the deployment domain). - # Override for IdPs that use different audience formats in access tokens. - # Okta: typically uses the auth server issuer URL (leave empty to use default). - # Entra ID example: ["api://my-app-name", "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12"] - OIDC_ALLOWED_AUDIENCE: [] - - # OAuth2 resource scope for the flyteadmin app. - # When set, this scope is requested during browser login and CLI PKCE flows - # so the IdP returns access tokens scoped to your app (correct audience). - # Without this, some IdPs (notably Entra ID) return generic access tokens - # with the wrong audience, causing access token validation to fail. - # Okta: leave empty (Okta scopes are configured on the auth server). - # Entra ID example: "api://my-app-name/all" - OIDC_APP_SCOPE: "" - - # Audience identifier for the CLI/SDK PKCE flow. - # Some IdPs require an explicit audience parameter in the authorization request. - # Okta: leave empty (derived from auth server). - # Entra ID example: "api://my-app-name" - OIDC_APP_AUDIENCE: "" - - # OAuth2 scope for browser authorization_code flow. - # Separate from OIDC_APP_SCOPE because Entra rejects /.default for same-app - # authorization_code flows (AADSTS90009). For Okta, leave empty. - # Entra ID example: "api://my-app-name/all" - OIDC_BROWSER_SCOPE: "" - - # OAuth2 scope for service-to-service authentication (client_credentials grant). - # Used by internal controlplane services (App 3) and dataplane operator (App 4). - # Okta: leave empty (defaults to "all", configured on the auth server). - # Entra ID example: "api://my-app-name/.default" - # Entra requires /.default suffix for client_credentials grants. - OIDC_S2S_SCOPE: "" - - # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are defined in the base values.yaml. - # Set them in your environment-specific values overlay. - # INTERNAL_CLIENT_ID: OAuth2 client ID for service-to-service calls (client_credentials). 
+ # All OIDC/OAuth2 globals are defined in the base values.yaml with documentation. + # Set them in your environment-specific values overlay generated by Terraform. + # INTERNAL_CLIENT_ID and AUTH_TOKEN_URL are also in the base values.yaml. # AUTH_TOKEN_URL: Token endpoint for service-to-service authentication. # ---------------------------------------------------------------------------- @@ -363,78 +287,8 @@ flyte: # Subject to removal in the future singleTenantOrgID: '{{ .Values.global.UNION_ORG }}' - # --- OIDC Authentication --- - # Flyteadmin acts as both the OAuth2 resource server (validates access tokens) - # and the OIDC relying party (browser login flow). Configure the globals in - # Section 1 above, then enable auth: - # - # server: - # security: - # useAuth: true - # - # The sections below map to different authentication flows: - # - # auth.appAuth.externalAuthServer - # Resource server config — validates access tokens from ALL flows. - # Uses OIDC_BASE_URL for JWKS discovery and token validation. - # Uses: OAuth Apps 1-5 (all tokens are validated here). - # Flows: Browser login, CLI/SDK PKCE, service-to-service. - # - # auth.appAuth.thirdPartyConfig.flyteClient - # CLI/SDK PKCE client config — returned by GetPublicClientConfig RPC. - # The SDK/CLI uses this to initiate the PKCE authorization flow. - # Uses: OAuth App 2 (CLI — public client). - # Flow: CLI/SDK PKCE (Flow 2). - # - # auth.userAuth.openId - # Browser login config — OIDC redirect flow for web console. - # Uses: OAuth App 1 (Browser — confidential client). - # Flow: Browser login (Flow 1). - # - # Service-to-service auth (OAuth Apps 3-5) is configured separately: - # - INTERNAL_CLIENT_ID + AUTH_TOKEN_URL in globals (App 3) - # - Operator and EAGER credentials in dataplane values (Apps 4, 5) - auth: - # Custom authorization header name. All services use this instead of - # the standard "authorization" header to avoid conflicts with service - # meshes (e.g. 
Linkerd, Istio) that intercept the default header. - httpAuthorizationHeader: "flyte-authorization" - grpcAuthorizationHeader: "flyte-authorization" - # URIs that flyteadmin accepts as valid audiences in its own tokens. - authorizedUris: - - "http://flyteadmin:80" - - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' - - # --- Resource Server (validates access tokens from all flows) --- - appAuth: - authServerType: "External" - externalAuthServer: - # OIDC issuer for JWKS discovery and token validation. - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - # Metadata discovery endpoint (see OIDC_METADATA_URL global). - metadataUrl: '{{ .Values.global.OIDC_METADATA_URL }}' - # allowedAudience: list of accepted JWT audiences for access token validation. - # Set in your environment-specific values overlay (list type, can't use a - # single global). Default: ["https://{domain}"]. - # Override via OIDC_ALLOWED_AUDIENCE or in your values overlay. - - # --- CLI/SDK PKCE Client (Flow 2: SDK/CLI authentication) --- - # Returned by the GetPublicClientConfig RPC. The SDK reads this to - # know which client ID, scopes, and audience to use for PKCE auth. - thirdPartyConfig: - flyteClient: - # OAuth App 2: CLI (public client, PKCE flow) - clientId: '{{ .Values.global.CLI_CLIENT_ID }}' - redirectUri: "http://localhost:53593/callback" - # Resource scope — determines the audience of the access token. - # See OIDC_APP_SCOPE global. - scopes: - - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' - # Audience parameter for the authorization request. - # See OIDC_APP_AUDIENCE global. - audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' - - # userAuth.openId is now in the base values.yaml with OIDC_BROWSER_SCOPE global. + # adminServer.auth is now fully configured in the base values.yaml + # using globals. No overlay-specific auth config needed. # Enable scheduler auth secret mount so flyte-secret-auth is mounted at /etc/secrets/. 
# Set clientSecret: "placeholder" so the subchart renders the secret — it must be diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index ed21e1c0..ec2d2bc9 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -68,6 +68,63 @@ global: # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/oauth2/v2.0/token" AUTH_TOKEN_URL: "" + # --- OIDC / OAuth2 Authentication --- + # Configure your identity provider (Okta, Entra ID, Keycloak, Authentik, etc.) + # OIDC_BASE_URL, OIDC_CLIENT_ID and CLI_CLIENT_ID are required for every IdP; the other globals default to values that work for Okta. + + # OIDC issuer URL (authorization server base URL). + # Okta example: "https://dev-123456.okta.com/oauth2/default" + # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" + OIDC_BASE_URL: "" + + # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). + # Default: ".well-known/oauth-authorization-server" (RFC 8414, Okta) + # Entra ID: ".well-known/openid-configuration" + OIDC_METADATA_URL: ".well-known/oauth-authorization-server" + + # OAuth2 client ID for the browser/web UI login (confidential client, authorization_code). + OIDC_CLIENT_ID: "" + + # OAuth2 client ID for CLI/SDK (public client, PKCE flow). + CLI_CLIENT_ID: "" + + # Allowed JWT audiences — set in values overlay (list type, can't be a tpl global). + # Default in base chart: ["https://{UNION_HOST}"]. + # Override: set flyte.configmap.adminServer.auth.appAuth.externalAuthServer.allowedAudience + # Entra ID example: ["api://my-app-name", "f0b2667d-..."] + + # OAuth2 resource scope for CLI/SDK and task pod authentication. + # Used in flyteClient.scopes (returned by GetPublicClientConfig RPC). + # Okta: leave empty (defaults to "all"). Entra ID: "api://my-app-name/.default" + OIDC_APP_SCOPE: "" + + # Audience parameter for CLI/SDK PKCE authorization requests. + # Okta: leave empty.
Entra ID: "api://my-app-name" + OIDC_APP_AUDIENCE: "" + + # OAuth2 scope for browser authorization_code flow (web console login). + # Separate from OIDC_APP_SCOPE because Entra rejects /.default for same-app + # authorization_code flows (AADSTS90009). + # Okta: leave empty. Entra ID: "api://my-app-name/all" + OIDC_BROWSER_SCOPE: "" + + # OAuth2 scope for service-to-service authentication (client_credentials grant). + # Okta: leave empty (defaults to "all"). Entra ID: "api://my-app-name/.default" + OIDC_S2S_SCOPE: "" + + # Subject claim resolution and identity type mapping — set in values overlay + # (list/map types, can't be tpl globals). + # Override: set flyte.configmap.adminServer.auth.appAuth.subjectClaimNames + # and flyte.configmap.adminServer.auth.appAuth.identityTypeClaimsForApps + # Entra ID examples: subjectClaimNames: ["sub", "client_id"] + # identityTypeClaimsForApps: {idtyp: [app]} + + # Subject ID for trusted internal service-to-service identity. + # The JWT "sub" claim value from client_credentials tokens for the internal client. + # Default: falls back to INTERNAL_CLIENT_ID (correct for Okta where sub == client_id). + # Entra ID: Service Principal Object ID (differs from client_id). 
+ INTERNAL_SUBJECT_ID: "" + # ---------------------------------------------------------------------------- # Additional Configuration # ---------------------------------------------------------------------------- @@ -339,6 +396,11 @@ configMap: legacyHosts: - '{{ .Values.global.UNION_ORG }}' union: + connection: + trustedIdentityClaims: + enabled: true + externalIdentityClaim: '{{ default .Values.global.INTERNAL_CLIENT_ID .Values.global.INTERNAL_SUBJECT_ID }}' + externalIdentityTypeClaim: "app" internalConnectionConfig: enabled: true urlPattern: "_SERVICE_.{{ .Release.Namespace }}.svc.cluster.local:80" @@ -1270,24 +1332,44 @@ flyte: # template: "{{ project }}-{{ domain }}" adminServer: - # --- Subject Claim Resolution (FAB-205) --- - # When flyteadmin validates a JWT access token, it resolves the caller's subject - # from JWT claims. By default it reads the standard "sub" claim. Some IdPs omit - # "sub" from client credentials tokens or use a different claim for the client - # identity (e.g. "client_id"). - # - # When subjectClaimNames is configured under auth.appAuth, it becomes the - # authoritative ordered list of claims to try. The first non-empty value wins - # and is used as the subject for all downstream identity resolution (including - # /me, authorization middleware, and internal service-to-service calls). - # - # When not configured, the standard "sub" claim is used (default behavior). 
- # - # auth: - # appAuth: - # subjectClaimNames: - # - sub - # - client_id # Common alternative for client credentials tokens + server: + security: + useAuth: true + auth: + httpAuthorizationHeader: "flyte-authorization" + grpcAuthorizationHeader: "flyte-authorization" + authorizedUris: + - "http://flyteadmin:80" + - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' + - 'https://{{ .Values.global.UNION_HOST }}' + appAuth: + authServerType: "External" + externalAuthServer: + baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' + metadataUrl: '{{ default ".well-known/oauth-authorization-server" .Values.global.OIDC_METADATA_URL }}' + # allowedAudience defaults to ["https://{UNION_HOST}"]. + # Override this list in your values overlay (list type, so it is not exposed as a global). + allowedAudience: + - 'https://{{ .Values.global.UNION_HOST }}' + thirdPartyConfig: + flyteClient: + clientId: '{{ .Values.global.CLI_CLIENT_ID }}' + redirectUri: "http://localhost:53593/callback" + scopes: + - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' + audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' + userAuth: + openId: + baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' + clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' + scopes: + - profile + - openid + - offline_access + cookieSetting: + sameSitePolicy: "LaxMode" + domain: "" + idpQueryParameter: "idp" admin: endpoint: 'dns:///{{ .Values.global.UNION_HOST }}' @@ -1330,7 +1412,7 @@ flyte: populateUserFields: false server: security: - useAuth: false + useAuth: true union: internalConnectionConfig: enabled: true From 26ecbd64543b0a16009d0ca3a29bcf0c85f08c02 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sun, 19 Apr 2026 09:14:06 +1000 Subject: [PATCH 22/23] Update authorizer dashboard: standardized backend metrics, identity_type - Replace External-specific panels with backend-agnostic panels - Backend Latency uses backend_authorize_duration_ms (works for all types) - Backend Errors uses backend_authorize_errors (works for all types)
- Allow/Deny Rate now shows identity_type breakdown (user/app/external) - Authorizer Mode shows authz_type_info{type="Union"} - Consistent metric prefix (no type-specific sub-scope) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../union-controlplane-overview.json | 647 +++++++++++------- 1 file changed, 406 insertions(+), 241 deletions(-) diff --git a/charts/controlplane/dashboards/union-controlplane-overview.json b/charts/controlplane/dashboards/union-controlplane-overview.json index 0106ea99..e37e0cb9 100644 --- a/charts/controlplane/dashboards/union-controlplane-overview.json +++ b/charts/controlplane/dashboards/union-controlplane-overview.json @@ -1042,7 +1042,7 @@ "y": 33 }, "id": 300, - "title": "FlyteAdmin (V1 + V2)", + "title": "FlyteAdmin", "type": "row", "panels": [ { @@ -1301,7 +1301,7 @@ "y": 34 }, "id": 400, - "title": "Executions (V1 + V2)", + "title": "Executions", "type": "row", "panels": [ { @@ -1807,7 +1807,7 @@ "y": 34 }, "id": 500, - "title": "Queue / Run-Scheduler (V2)", + "title": "Queue / Run-Scheduler", "type": "row", "panels": [ { @@ -2224,7 +2224,7 @@ "y": 35 }, "id": 600, - "title": "Cluster Service (V1 + V2)", + "title": "Cluster Service", "type": "row", "panels": [ { @@ -2548,7 +2548,7 @@ "y": 36 }, "id": 900, - "title": "CacheService (V1 + V2)", + "title": "CacheService", "type": "row", "panels": [ { @@ -2656,401 +2656,566 @@ "y": 36 }, "id": 750, - "title": "Authorizer (V1 + V2)", + "title": "Authorizer", "type": "row", "panels": [ { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" + "id": 760, + "title": "Authorizer Mode", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 37 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] 
+ } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 15 + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "name", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, - "id": 751, - "title": "Allow / Deny Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Allowed", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", + "legendFormat": "{{type}}", "refId": "A" - }, - { - "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Denied", - "refId": "B" } ], - "description": "Authorization decision rate. Allow/deny ratio indicates auth health. High deny rate may signal misconfigured policies. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 751, + "title": "Allow / Deny Rate", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 10, + "x": 4, + "y": 37 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "spanNulls": false }, - "unit": "ms" - } + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*denied.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*allowed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 15 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi" + } }, - "id": 752, - "title": "Authorize Latency", - "type": "timeseries", "targets": [ { - "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "allowed ({{identity_type}})", "refId": "A" }, { - "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", + "expr": "sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "denied ({{identity_type}})", "refId": "B" - }, - { - "expr": 
"authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "End-to-end Authorize() latency including identity resolution and backend authorization check. [Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 753, + "title": "Deny Rate (%)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 10, + "x": 14, + "y": 37 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "unit": "percentunit", "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "percentunit" - } + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "noValue": "0", + "decimals": 1, + "min": 0, + "max": 1 + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 15 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 753, - "title": "Deny Rate (%)", - "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]) / (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]) + rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Deny %", + "expr": "(sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)) / clamp_min((sum by (identity_type) 
(rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])) + sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))), 1e-10)", + "legendFormat": "{{identity_type}}", "refId": "A" } ], - "description": "Percentage of authorization decisions that denied access. Spikes indicate policy changes or auth issues. [Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 752, + "title": "Authorize Latency (service)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 45 }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" + "unit": "ms", + "custom": { + "drawStyle": "line", + "fillOpacity": 10 }, + "noValue": "0", + "decimals": 1, "thresholds": { + "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 200 } ] - }, - "mappings": [ - { - "type": "value", - "options": { - "Noop": { "text": "Noop", "index": 0 }, - "noop": { "text": "Noop", "index": 1 }, - "UserClouds": { "text": "UserClouds", "index": 2 }, - "userclouds": { "text": "UserClouds", "index": 3 }, - "External": { "text": "External", "index": 4 }, - "external": { "text": "External", "index": 5 }, - "Authorizer": { "text": "Authorizer", "index": 6 }, - "authorizer": { "text": "Authorizer", "index": 7 } - } - } - ] - } - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 0, - "y": 23 + } + }, + "overrides": [] }, - "id": 760, "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^type$/" + "legend": { + "displayMode": "list", + "placement": "bottom" }, - "textMode": "value" + "tooltip": { + "mode": "multi" + } }, - "title": "Authorizer Mode", - "type": "stat", "targets": [ { - 
"expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", - "legendFormat": "{{ type }}", + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", "refId": "A" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Currently active authorizer backend type (Noop, UserClouds, External, Authorizer)." - }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 761, + "title": "Backend Latency", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 45 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "unit": "ms", "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "ms" - } + "noValue": "0", + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 200 + } + ] + } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 8, - "x": 4, - "y": 23 + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 761, - "title": "External Backend Latency", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) 
(rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": "p95", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": "p99", "refId": "C" } ], - "description": "Latency of calls to the external authorization backend (p50/p95/p99)." 
- }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 764, + "title": "Decisions by Action", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 45 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "drawStyle": "bars", + "fillOpacity": 50, + "stacking": { + "mode": "normal" + } }, - "unit": "ops" - } + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 23 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "sum" + ] + }, + "tooltip": { + "mode": "multi" + } }, - "id": 762, - "title": "External Errors by gRPC Code", - "type": "timeseries", "targets": [ { - "expr": "sum by (grpc_code) (rate(authorizer:authorizer:cloudauthorizer:connect:external:errors{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_code }}", + "expr": "sum by (action, identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{action}} {{identity_type}} (allowed)", "refId": "A" + }, + { + "expr": "sum by (action, identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{action}} {{identity_type}} (denied)", + "refId": "B" } ], - "description": "Error rate from the external authorization backend, broken down by gRPC status code." 
- }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 762, + "title": "Backend Errors", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 53 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "ops" - } + "noValue": "0", + "unit": "ops", + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 23 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 763, - "title": "Fail-Open Activations", - "type": "timeseries", "targets": [ { - "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Fail-Open", + "expr": "sum by (error_type) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_errors{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "{{error_type}}", "refId": "A" } ], - "description": "Rate of fail-open activations. Non-zero means the external backend is unreachable and requests are being allowed without authorization." 
- }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 765, + "title": "Error Attribution", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 53 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } + "fillOpacity": 10 }, - "unit": "ops" - } + "noValue": "0", + "unit": "ops", + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 31 + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 764, - "title": "Decisions by Action", - "type": "timeseries", "targets": [ { - "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "allowed: {{ action }}", + "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "{{error_source}}", "refId": "A" - }, - { - "expr": "sum by (action) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "denied: {{ action }}", - "refId": "B" } ], - "description": "Authorization decisions broken down by action (e.g. read, write, execute). Stacked to show total volume." 
- }, - { "datasource": { "type": "prometheus", "uid": "${datasource}" + } + }, + { + "id": 763, + "title": "Fail-Open Activations", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 53 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" + "fillOpacity": 10 }, - "unit": "ops" - } + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.001 + } + ] + }, + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 31 + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, - "id": 765, - "title": "Error Attribution", - "type": "timeseries", "targets": [ { - "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ error_source }}", + "expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval]) or vector(0)", + "legendFormat": "fail-open", "refId": "A" } ], - "description": "Authorization errors attributed by source (e.g. identity resolution, backend, policy evaluation)." 
+ "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ] }, @@ -3570,4 +3735,4 @@ "title": "Union Controlplane Overview", "uid": "union-cp-overview", "version": 2 -} +} \ No newline at end of file From d93edb8eabc6eb1eb90f244e7a5e8b8436a2e687 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sun, 19 Apr 2026 14:22:16 +1000 Subject: [PATCH 23/23] Consolidate auth config into adminServer.auth block Move all OIDC auth configuration from scattered globals into one documented flyte.configmap.adminServer.auth block with inline Okta/Entra ID examples. - Deprecate auth globals: OIDC_BASE_URL, OIDC_CLIENT_ID, CLI_CLIENT_ID - Remove new globals: OIDC_METADATA_URL, OIDC_APP_SCOPE, etc. - Keep S2S globals: INTERNAL_CLIENT_ID, AUTH_TOKEN_URL, OIDC_S2S_SCOPE - Auth block uses literal values, not tpl global references - Authn modules output complete appAuth/userAuth blocks - Authz templates accept both "Union" and "UserClouds" type - Dashboard: standardized backend metrics, identity_type, V1+V2 removed Co-Authored-By: Claude Opus 4.6 (1M context) --- .../templates/authz/configmap.yaml | 2 +- .../templates/authz/deployment.yaml | 2 +- charts/controlplane/templates/authz/hpa.yaml | 2 +- .../templates/authz/networkpolicy.yaml | 2 +- charts/controlplane/templates/authz/pdb.yaml | 2 +- charts/controlplane/templates/authz/rbac.yaml | 2 +- .../controlplane/templates/authz/service.yaml | 2 +- .../templates/authz/serviceaccount.yaml | 2 +- charts/controlplane/values.yaml | 125 ++++++++++-------- 9 files changed, 75 insertions(+), 66 deletions(-) diff --git a/charts/controlplane/templates/authz/configmap.yaml b/charts/controlplane/templates/authz/configmap.yaml index 37fdda2f..5dbf8651 100644 --- a/charts/controlplane/templates/authz/configmap.yaml +++ b/charts/controlplane/templates/authz/configmap.yaml @@ -1,4 +1,4 @@ -{{- if eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds" -}} +{{- if or (eq ((index .Values 
"services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") -}} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/controlplane/templates/authz/deployment.yaml b/charts/controlplane/templates/authz/deployment.yaml index d93e44a6..d4419f40 100644 --- a/charts/controlplane/templates/authz/deployment.yaml +++ b/charts/controlplane/templates/authz/deployment.yaml @@ -1,4 +1,4 @@ -{{- if eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds" -}} +{{- if or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") -}} apiVersion: apps/v1 kind: Deployment metadata: diff --git a/charts/controlplane/templates/authz/hpa.yaml b/charts/controlplane/templates/authz/hpa.yaml index 9f8de177..41fe2809 100644 --- a/charts/controlplane/templates/authz/hpa.yaml +++ b/charts/controlplane/templates/authz/hpa.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.autoscaling.enabled }} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.autoscaling.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: diff --git a/charts/controlplane/templates/authz/networkpolicy.yaml b/charts/controlplane/templates/authz/networkpolicy.yaml index 3105c0c6..87d6e18e 100644 --- a/charts/controlplane/templates/authz/networkpolicy.yaml +++ b/charts/controlplane/templates/authz/networkpolicy.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" 
"configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.networkPolicy.enabled }} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.networkPolicy.enabled }} apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: diff --git a/charts/controlplane/templates/authz/pdb.yaml b/charts/controlplane/templates/authz/pdb.yaml index 42e1a421..71bb7229 100644 --- a/charts/controlplane/templates/authz/pdb.yaml +++ b/charts/controlplane/templates/authz/pdb.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.pdb.enabled }} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.pdb.enabled }} apiVersion: policy/v1 kind: PodDisruptionBudget metadata: diff --git a/charts/controlplane/templates/authz/rbac.yaml b/charts/controlplane/templates/authz/rbac.yaml index d681d292..bd7fbf8d 100644 --- a/charts/controlplane/templates/authz/rbac.yaml +++ b/charts/controlplane/templates/authz/rbac.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.serviceAccount.create -}} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.serviceAccount.create -}} apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: diff --git a/charts/controlplane/templates/authz/service.yaml 
b/charts/controlplane/templates/authz/service.yaml index 79cfa05f..a969eddc 100644 --- a/charts/controlplane/templates/authz/service.yaml +++ b/charts/controlplane/templates/authz/service.yaml @@ -1,4 +1,4 @@ -{{- if eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds" -}} +{{- if or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") -}} apiVersion: v1 kind: Service metadata: diff --git a/charts/controlplane/templates/authz/serviceaccount.yaml b/charts/controlplane/templates/authz/serviceaccount.yaml index 2b3fab43..971dc7fc 100644 --- a/charts/controlplane/templates/authz/serviceaccount.yaml +++ b/charts/controlplane/templates/authz/serviceaccount.yaml @@ -1,4 +1,4 @@ -{{- if and (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds") .Values.union.authz.serviceAccount.create -}} +{{- if and (or (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "Union") (eq ((index .Values "services" "authorizer" "configMap" "authorizer" "type") | default "") "UserClouds")) .Values.union.authz.serviceAccount.create -}} apiVersion: v1 kind: ServiceAccount metadata: diff --git a/charts/controlplane/values.yaml b/charts/controlplane/values.yaml index ec2d2bc9..4c983ecc 100644 --- a/charts/controlplane/values.yaml +++ b/charts/controlplane/values.yaml @@ -69,62 +69,22 @@ global: AUTH_TOKEN_URL: "" # --- OIDC / OAuth2 Authentication --- - # Configure your identity provider (Okta, Entra ID, Keycloak, Authentik, etc.) - # All globals below have safe defaults for Okta. Override for other IdPs. + # Auth configuration is in flyte.configmap.adminServer.auth (see below). + # Set auth fields directly in that block or via your values overlay. + # The globals below are kept for backward compatibility only. 
- # OIDC issuer URL (authorization server base URL). - # Okta example: "https://dev-123456.okta.com/oauth2/default" - # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" + # Deprecated: set flyte.configmap.adminServer.auth.appAuth.externalAuthServer.baseUrl and flyte.configmap.adminServer.auth.userAuth.openId.baseUrl instead. OIDC_BASE_URL: "" - - # OIDC metadata discovery endpoint (relative to OIDC_BASE_URL). - # Default: ".well-known/oauth-authorization-server" (RFC 8414, Okta) - # Entra ID: ".well-known/openid-configuration" - OIDC_METADATA_URL: ".well-known/oauth-authorization-server" - - # OAuth2 client ID for the browser/web UI login (confidential client, authorization_code). + # Deprecated: set flyte.configmap.adminServer.auth.userAuth.openId.clientId instead. OIDC_CLIENT_ID: "" - - # OAuth2 client ID for CLI/SDK (public client, PKCE flow). + # Deprecated: set flyte.configmap.adminServer.auth.appAuth.thirdPartyConfig.flyteClient.clientId instead. CLI_CLIENT_ID: "" - # Allowed JWT audiences — set in values overlay (list type, can't be a tpl global). - # Default in base chart: ["https://{UNION_HOST}"]. - # Override: set flyte.configmap.adminServer.auth.appAuth.externalAuthServer.allowedAudience - # Entra ID example: ["api://my-app-name", "f0b2667d-..."] - - # OAuth2 resource scope for CLI/SDK and task pod authentication. - # Used in flyteClient.scopes (returned by GetPublicClientConfig RPC). - # Okta: leave empty (defaults to "all"). Entra ID: "api://my-app-name/.default" - OIDC_APP_SCOPE: "" - - # Audience parameter for CLI/SDK PKCE authorization requests. - # Okta: leave empty. Entra ID: "api://my-app-name" - OIDC_APP_AUDIENCE: "" - - # OAuth2 scope for browser authorization_code flow (web console login). - # Separate from OIDC_APP_SCOPE because Entra rejects /.default for same-app - # authorization_code flows (AADSTS90009). - # Okta: leave empty. Entra ID: "api://my-app-name/all" - OIDC_BROWSER_SCOPE: "" - # OAuth2 scope for service-to-service authentication (client_credentials grant).
+ # Used by configMap.union.auth and executions.adminClient.connection (S2S concern). # Okta: leave empty (defaults to "all"). Entra ID: "api://my-app-name/.default" OIDC_S2S_SCOPE: "" - # Subject claim resolution and identity type mapping — set in values overlay - # (list/map types, can't be tpl globals). - # Override: set flyte.configmap.adminServer.auth.appAuth.subjectClaimNames - # and flyte.configmap.adminServer.auth.appAuth.identityTypeClaimsForApps - # Entra ID examples: subjectClaimNames: ["sub", "client_id"] - # identityTypeClaimsForApps: {idtyp: [app]} - - # Subject ID for trusted internal service-to-service identity. - # The JWT "sub" claim value from client_credentials tokens for the internal client. - # Default: falls back to INTERNAL_CLIENT_ID (correct for Okta where sub == client_id). - # Entra ID: Service Principal Object ID (differs from client_id). - INTERNAL_SUBJECT_ID: "" - # ---------------------------------------------------------------------------- # Additional Configuration # ---------------------------------------------------------------------------- @@ -343,7 +303,7 @@ envoyGateway: enabled: false requestsPerUnit: 100 unit: Second - + # -- Central logging configuration. All controlplane services pull their log level from here. # Go services use level 1–6 (1=least verbose, 6=most verbose; 4=INFO, 6=DEBUG). # Log format options: json, text, gcp @@ -397,9 +357,12 @@ configMap: - '{{ .Values.global.UNION_ORG }}' union: connection: + # Overridden by terraform from authn module trusted_identity_claims output. 
+ # Okta: externalIdentityClaim = internal client_id (sub == client_id) + # Entra ID: externalIdentityClaim = Service Principal Object ID trustedIdentityClaims: enabled: true - externalIdentityClaim: '{{ default .Values.global.INTERNAL_CLIENT_ID .Values.global.INTERNAL_SUBJECT_ID }}' + externalIdentityClaim: "" externalIdentityTypeClaim: "app" internalConnectionConfig: enabled: true @@ -1335,6 +1298,14 @@ flyte: server: security: useAuth: true + + # --- OIDC Authentication Configuration --- + # Configure your identity provider below. Set values directly or via + # your values overlay. For Union-managed deployments, the authn Terraform + # module generates this block automatically. + # + # Supported IdPs: Okta, Entra ID (Azure AD), Keycloak, Authentik. + # See unionai-docs for provider-specific setup guides. auth: httpAuthorizationHeader: "flyte-authorization" grpcAuthorizationHeader: "flyte-authorization" @@ -1342,26 +1313,64 @@ flyte: - "http://flyteadmin:80" - 'http://flyteadmin.{{ .Release.Namespace }}.svc.cluster.local:80' - 'https://{{ .Values.global.UNION_HOST }}' + appAuth: authServerType: "External" externalAuthServer: - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - metadataUrl: '{{ default ".well-known/oauth-authorization-server" .Values.global.OIDC_METADATA_URL }}' - # allowedAudience defaults to ["https://{UNION_HOST}"]. - # Override via OIDC_ALLOWED_AUDIENCE global or values overlay. + # --- OIDC Issuer --- + # Okta example: "https://dev-123456.okta.com/oauth2/default" + # Entra ID example: "https://login.microsoftonline.com/{tenant-id}/v2.0" + baseUrl: "" + # Metadata discovery endpoint (relative to baseUrl). + # Okta: ".well-known/oauth-authorization-server" (default) + # Entra ID: ".well-known/openid-configuration" + metadataUrl: ".well-known/oauth-authorization-server" + # Allowed JWT audiences for access token validation. + # Default: ["https://{UNION_HOST}"]. 
+ # Entra ID example: ["api://my-app-name", "f0b2667d-..."] allowedAudience: - 'https://{{ .Values.global.UNION_HOST }}' + + # --- Subject claim resolution (FAB-205) --- + # Ordered list of JWT claims to try for caller identity. + # Only needed for IdPs where client_credentials tokens omit "sub". + # Default: uses standard "sub" claim. + # Override appAuth.subjectClaimNames in values overlay if your IdP requires fallback claims (e.g. ["sub", "client_id"]). + + # --- Identity type claim mapping --- + # Maps IdP-specific claims to internal identitytype. + # Okta: not needed (identitytype claim is native) + # Entra ID example (set in values overlay): + # identityTypeClaimsForApps: + # idtyp: ["app"] + thirdPartyConfig: flyteClient: - clientId: '{{ .Values.global.CLI_CLIENT_ID }}' + # --- CLI/SDK PKCE Client --- + # Okta example: "0oa7mno8pqr9stu0v1w2" + # Entra ID example: "3df10225-18a5-4636-b1ef-582e5a8ea21c" + clientId: "" redirectUri: "http://localhost:53593/callback" + # Resource scope for CLI/SDK and task pod authentication. + # Okta: ["all"] (default) + # Entra ID: ["api://my-app-name/.default"] scopes: - - '{{ default "all" .Values.global.OIDC_APP_SCOPE }}' - audience: '{{ default "" .Values.global.OIDC_APP_AUDIENCE }}' + - "all" + # Audience parameter for authorization requests. + # Okta: "" (not needed) + # Entra ID: "api://my-app-name" + audience: "" + userAuth: openId: - baseUrl: '{{ .Values.global.OIDC_BASE_URL }}' - clientId: '{{ .Values.global.OIDC_CLIENT_ID }}' + # --- Browser Login --- + # Okta clientId example: "0oa1abc2def3ghi4j5k6" + # Entra ID clientId example: "f0b2667d-5e99-45f2-ae4a-2ab47cb5fa12" + baseUrl: "" + clientId: "" + # Browser login scopes. + # Okta: ["profile", "openid", "offline_access"] (default) + # Entra ID: ["profile", "openid", "offline_access", "api://my-app/all"] scopes: - profile - openid