From 3ff2b549a83232b2822a256dd59a0ddb2be97dfb Mon Sep 17 00:00:00 2001
From: Nelson Araujo
Date: Sat, 17 Jan 2026 20:24:08 -0800
Subject: [PATCH] Support for GCP control plane, single/dual cluster

---
 README-self-hosted-GCP-with-Terraform.md      |  279 +++
 charts/controlplane/out.yaml                  | 2032 -----------------
 charts/controlplane/templates/service.yaml    |    6 +-
 ...controlplane.selfhosted-intracluster.yaml} |   29 +-
 ...cp.dataplane.selfhosted-intracluster.yaml} |   12 +-
 5 files changed, 315 insertions(+), 2043 deletions(-)
 create mode 100644 README-self-hosted-GCP-with-Terraform.md
 delete mode 100644 charts/controlplane/out.yaml
 rename charts/controlplane/{values.gcp.selfhosted-intracluster.yaml => values.gcp.controlplane.selfhosted-intracluster.yaml} (95%)
 rename charts/dataplane/{values.gcp.selfhosted-intracluster.yaml => values.gcp.dataplane.selfhosted-intracluster.yaml} (97%)

diff --git a/README-self-hosted-GCP-with-Terraform.md b/README-self-hosted-GCP-with-Terraform.md
new file mode 100644
index 00000000..108d4a01
--- /dev/null
+++ b/README-self-hosted-GCP-with-Terraform.md
@@ -0,0 +1,279 @@
# Self-Hosted Union on GCP using Terraform

This step-by-step tutorial deploys a self-hosted Union (both control plane
and data plane) on GCP using Union's reference Terraform modules.

> The customer is free to use any other infrastructure mechanism, be that their
> own Terraform modules or other means, as long as their choice of system
> produces the same output as the reference Terraform modules.
>
> This is critical because the instructions herein assume those resources exist
> and are configured accordingly. You are welcome to perform all tasks manually
> by following the full manual step-by-step instructions in the respective
> [Control Plane GCP][manual-cp-gcp] and [Data Plane GCP][manual-dp-gcp] guides.

## Resources Needed

- **VPC**: networking details to run Union
- **GKE**: where Union will run. It can be the same cluster or distinct
  clusters.
- **Cloud SQL (Postgres)**: a database for the control plane (CP) to store job
  and run information.
- **Workload Identities**: used to allow the GKE cluster to assume IAM roles
  and acquire privileges to perform operations, e.g., accessing GCS
- **IAM service accounts**: the accounts used to perform privileged operations,
  e.g., writing state to GCS
- **ScyllaDB**: a high-performance NoSQL store for dynamic state

## Deploying Infrastructure Resources

To deploy the infrastructure resources we will use the Terraform modules you
received. They are:

- `infra`: Creates all control plane and data plane infrastructure resources
  - `infra_ext`: An adapter to plug into existing infrastructure without
    creating it. This is used to make your existing VPC and GKE compatible
    with the modules below.
- `controlplane`: Creates all the control plane resources
- `dataplane`: Creates all the data plane resources

### Resources created by the Terraform modules

- **VPC**: networking details to run Union
- **GKE**: where Union will run. It can be the same cluster or distinct
  clusters, depending on the value of the variable
  `dedicated_dataplane_cluster`. If `true`, two clusters will be created (or
  referenced); if `false`, the same cluster is shared between CP and DP.
- **Cloud SQL (Postgres)**: a database for the CP to store job and run
  information.
- **Workload Identities**: used to allow the GKE cluster to assume IAM roles
  and acquire privileges to perform operations, e.g., accessing GCS
- **IAM service accounts**: the accounts used to perform privileged operations,
  e.g., writing state to GCS

### Deployment Instructions

1. Unpack the Terraform modules
2. Choose from the examples, either `create-infra` or `already-created-infra`,
   depending on whether you want the module to create the VPC and GKE or not
   (a sketch of the resulting wiring follows this list).
   - Update the values within with your specific project details
   - To share the same cluster between CP and DP, set
     `dedicated_dataplane_cluster = false`; otherwise set
     `dedicated_dataplane_cluster = true`.
3. `terraform init` to pull the required providers
4. `terraform plan` and review the objects to be created
5. `terraform apply` to make it so
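As a rough sketch of what steps 2 through 5 wire together, a minimal root
module might look like the following. Everything here other than
`dedicated_dataplane_cluster` is illustrative; take the actual module source
path, variable names, and values from the example you received.

    cat > main.tf <<'EOF'
    # Sketch only: the module path and variables below are illustrative,
    # not the authoritative interface of the modules you received.
    module "infra" {
      source                      = "./modules/infra"  # hypothetical path
      project_id                  = "my-gcp-project"   # hypothetical project
      region                      = "us-central1"      # hypothetical region
      dedicated_dataplane_cluster = false              # true = separate CP and DP clusters
    }
    EOF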
At the end of this run you will have:

- VPC created (optional)
- GKE created (optional)
- Postgres in Cloud SQL
- Postgres account information (loaded as a secret in GKE)
- Self-signed certificates for the Control Plane (loaded as a secret in GKE)
- Workload identities for the Data Plane backend

## Deploying Union

### Deploying Control Plane

Since you used the Terraform modules (or performed your own steps that produced
the same objects), we will skip all the manual steps listed in the [Control
Plane GCP][manual-cp-gcp] page and move straight to `helm install`.

#### Gathering Infra Details

> We will eventually make Terraform output the values file. For now, please
> find these values in the output of the Terraform execution (or call
> `terraform output`); a sketch of this follows the tables below.

Mostly you will concentrate on the `global` section of the values file:

| Value                     | Description                                     | Source                                                                  |
| ------------------------- | ----------------------------------------------- | ----------------------------------------------------------------------- |
| `GCP_REGION`              | The region where the Control Plane is installed | `main.tf > module > infra > region`                                      |
| `DB_HOST`                 | The IP address of the Postgres database         | `terraform output controlplane > db > host`                              |
| `BUCKET_NAME`             | Bucket used for system functions                | `terraform output controlplane > gcs > flyte > id`                       |
| `ARTIFACTS_BUCKET_NAME`   | Bucket used to store artifacts                  | `terraform output controlplane > gcs > artifacts > id`                   |
| `ARTIFACT_IAM_ROLE_ARN`   | The IAM service account used to access artifacts | `terraform output controlplane > service_accounts > artifacts > email`  |
| `FLYTEADMIN_IAM_ROLE_ARN` | The IAM service account used to access system storage | `terraform output controlplane > service_accounts > flyte > email` |
| `UNION_ORG`               | The name of the Union organization              | `main.tf > locals > union_org`                                           |
| `GOOGLE_PROJECT_ID`       | The name of the GCP project                     | `main.tf > locals > project_id`                                          |

We will ignore the following value for now and come back to it after we
install the Data Plane:

| Value                | Description                              | Source                                             |
| -------------------- | ---------------------------------------- | -------------------------------------------------- |
| `DATAPLANE_ENDPOINT` | The ingress endpoint for the data plane  | `EXTERNAL-IP` of the Data Plane ingress service    |

> If you are using DNS entries for the ingress endpoints, and you know the Data
> Plane ingress DNS in advance, you can specify it now and skip updating this
> later.
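For example, several of these values can be pulled straight from the Terraform
state (a sketch; the exact output names depend on the module version you
received, so verify the paths against a plain `terraform output` first):

    # Hypothetical output paths; confirm them with `terraform output` first
    terraform output -json controlplane | jq -r '.db.host'
    terraform output -json controlplane | jq -r '.gcs.flyte.id'
    terraform output -json controlplane | jq -r '.service_accounts.artifacts.email'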
#### CP Deployment Instructions

0. Ensure your current Kubernetes context points to the cluster where you want
   to install the Control Plane, i.e., execute `gcloud container clusters
   get-credentials` or whichever mechanism you use to make your cluster the
   default.

1. Unpack the Helm charts

2. Make a copy of the [GCP CP Self-Hosted values.yaml][gcp-cp-values]
   - If you received a reference values file from Union personnel, use that
     instead.

3. Update the `values.yaml` from step 2 with your project- and
   environment-specific information

4. Load the registry access secret into the cluster:

       kubectl create secret docker-registry union-registry-secret \
         --docker-server=registry.unionai.cloud \
         --docker-username='<username>' \
         --docker-password='<password>' \
         -n union-cp

5. `helm install` the Control Plane chart:

       cd charts/controlplane
       helm upgrade --install unionai-controlplane . \
         --namespace union-cp \
         --create-namespace \
         --values your-values.yaml \
         --timeout 15m

6. Wait for a little bit: get a coffee, maybe take a short walk?

#### CP Deployment Verification

Confirm all services are running:

    kubectl get pod -n union-cp

You should see something like this:

    NAME                                             READY   STATUS    RESTARTS   AGE
    authorizer-6f8f655467-l44mt                      1/1     Running   0          33h
    cacheservice-6979466f8c-6rn5g                    1/1     Running   0          33h
    cluster-c5597448-prvbq                           1/1     Running   0          33h
    controlplane-nginx-controller-857b568794-twfkr   1/1     Running   0          33h
    dataproxy-6d484b7c45-rbgw9                       1/1     Running   0          11h
    executions-5d4fb97788-wgg6w                      1/1     Running   0          33h
    flyteadmin-74fbf5bbd9-x6lb5                      1/1     Running   0          33h
    flyteconsole-d6859d494-wnrxl                     1/1     Running   0          33h
    queue-78f8fb75f4-22qp8                           1/1     Running   0          33h
    run-scheduler-54667b6d96-w5z8p                   1/1     Running   0          33h
    scylla-dc1-rack1-0                               4/4     Running   0          33h
    scylla-dc1-rack1-1                               4/4     Running   0          33h
    scylla-dc1-rack1-2                               4/4     Running   0          33h
    unionconsole-55d946668-nlf7x                     1/1     Running   0          33h
    usage-5ddf757d6d-cjlr8                           1/1     Running   0          33h

At this point the control plane setup is complete.
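If you would rather script this check than eyeball the pod list, you can wait
for every pod in the namespace to become Ready (a sketch; adjust the timeout
to taste):

    kubectl wait pod --all -n union-cp --for=condition=Ready --timeout=15m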
### Deploying Data Plane

The process to deploy the Data Plane is very similar to the Control Plane.

#### DP Gathering Infra Details

| Value                             | Description                                            | Source                                                                     |
| --------------------------------- | ------------------------------------------------------ | -------------------------------------------------------------------------- |
| `CLUSTER_NAME`                    | Name of the Data Plane cluster                         | `terraform output dataplane > union > cluster_name`                         |
| `ORG_NAME`                        | Union organization                                     | `terraform output dataplane > union > org`                                  |
| `METADATA_BUCKET`                 | System metadata bucket                                 | `terraform output dataplane > gcs > metadata > name`                        |
| `FAST_REGISTRATION_BUCKET`        | Fast registration bucket (can be the same as the metadata bucket) | `terraform output dataplane > gcs > fast_registration > name`    |
| `GCP_REGION`                      | The region where the Data Plane is installed           | `main.tf > module > infra > region`                                         |
| `GOOGLE_PROJECT_ID`               | The name of the GCP project                            | `main.tf > locals > project_id`                                             |
| `BACKEND_IAM_ROLE_ARN`            | The IAM service account backend services run as        | `terraform output dataplane > gcp > service_accounts > backend > email`     |
| `WORKER_IAM_ROLE_ARN`             | The IAM service account workers run as                 | `terraform output dataplane > gcp > service_accounts > worker > email`      |
| `CONTROLPLANE_INTRA_CLUSTER_HOST` | Host the Data Plane uses to reach the Control Plane    | On the CP GKE cluster, `kubectl get svc controlplane-nginx-controller` and pick `EXTERNAL-IP` |
| `QUEUE_SERVICE_HOST`              | Queue service endpoint on the Control Plane            | On the CP GKE cluster, `kubectl get svc queue` and pick `EXTERNAL-IP`       |
| `FLYTEADMIN_ENDPOINT`             | FlyteAdmin endpoint on the Control Plane               | On the CP GKE cluster, `kubectl get svc flyteadmin` and pick `EXTERNAL-IP`  |
| `CACHESERVICE_ENDPOINT`           | Cache service endpoint on the Control Plane            | On the CP GKE cluster, `kubectl get svc cacheservice` and pick `EXTERNAL-IP` |

#### DP Deployment Instructions

0. Ensure your current Kubernetes context points to the cluster where you want
   to install the Data Plane. _If you are not sharing the same cluster, note
   this should now point to the **data plane** cluster._

1. Make a copy of the [GCP DP Self-Hosted values.yaml][gcp-dp-values]
   - If you received a reference values file from Union personnel, use that
     instead.

2. Update the `values.yaml` from step 1 with your project- and
   environment-specific information

3. `helm install` the Data Plane chart:

       cd charts/dataplane
       helm upgrade --install unionai-dataplane . \
         --namespace union \
         --create-namespace \
         --values your-values.yaml \
         --timeout 10m \
         --wait

   > It is important to deploy in the `union` namespace, so do not skip the
   > `--namespace union` argument. The Workload Identity IAM is configured for
   > that namespace, and changing it only here will make things fail.

4. Wait for a little bit: time for more coffee or another walk?

#### DP Deployment Verification

Confirm all services are running:

    kubectl get pod -n union

You should see something like this:

    NAME                                          READY   STATUS    RESTARTS   AGE
    dataplane-nginx-controller-859754bb66-zxjgs   1/1     Running   0          12h
    executor-6b9fbfb46d-bczbs                     1/1     Running   0          12h
    flytepropeller-54b98486b4-59qnw               1/1     Running   0          12h
    flytepropeller-webhook-6fc47cd8fd-rf7nt       1/1     Running   0          12h
    prometheus-operator-5cff9b5487-rb7nn          1/1     Running   0          12h
    prometheus-union-operator-prometheus-0        2/2     Running   0          12h
    syncresources-56d976c8-7s28f                  1/1     Running   0          12h
    union-operator-d8746c9f9-6c6lz                1/1     Running   0          12h
    union-operator-proxy-5fd674b9dd-jp8vb         1/1     Running   0          12h
    unionai-dataplane-fluentbit-572gt             1/1     Running   0          12h
    unionai-dataplane-fluentbit-n4tbd             1/1     Running   0          12h
    unionai-dataplane-fluentbit-qhknp             1/1     Running   0          12h
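Because the Workload Identity binding is tied to the `union` namespace, it is
also worth confirming that the GKE annotation landed on the service accounts
(a sketch; the service account names vary by chart version):

    kubectl get serviceaccounts -n union -o yaml \
      | grep -B 5 'iam.gke.io/gcp-service-account'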
#### Binding DP to CP

> This step is only needed if you're not using DNS entries for the Data Plane
> ingress, or if you do but cannot predict them before installing the Control
> Plane.

The Control Plane needs to reach the Data Plane to send work to it. Therefore,
we need to "teach" the Control Plane where to find the Data Plane. That is
accomplished with the `DATAPLANE_ENDPOINT` value in the Control Plane Helm
chart.

1. Find the IP address (or DNS entry) for the Data Plane ingress, and pick the
   `EXTERNAL-IP`:

       kubectl get svc dataplane-nginx-controller -n union

2. Update the `DATAPLANE_ENDPOINT` value in your Control Plane `values.yaml`

3. Make sure you switch your Kubernetes context back to the Control Plane GKE
   cluster.

4. Run a Helm upgrade to propagate the value (the same command used to install
   the Control Plane):

       cd charts/controlplane
       helm upgrade --install unionai-controlplane . \
         --namespace union-cp \
         --create-namespace \
         --values your-values.yaml \
         --timeout 15m

Once this completes, you're done! Both the control plane and the data plane
are successfully set up.

[manual-cp-gcp]: https://github.com/unionai/helm-charts/blob/nelson/nav-gcp/charts/controlplane/SELFHOSTED_INTRA_CLUSTER_GCP.md
[manual-dp-gcp]: https://github.com/unionai/helm-charts/blob/nelson/nav-gcp/charts/dataplane/SELFHOSTED_INTRA_CLUSTER_GCP.md
[gcp-cp-values]: https://github.com/unionai/helm-charts/blob/nelson/nav-gcp/charts/controlplane/values.gcp.controlplane.selfhosted-intracluster.yaml
[gcp-dp-values]: https://github.com/unionai/helm-charts/blob/nelson/nav-gcp/charts/dataplane/values.gcp.dataplane.selfhosted-intracluster.yaml

diff --git a/charts/controlplane/out.yaml b/charts/controlplane/out.yaml
deleted file mode 100644
index 01ab087d..00000000
--- a/charts/controlplane/out.yaml
+++ /dev/null
@@ -1,2032 +0,0 @@
-apiVersion: v1
-items:
-- apiVersion: networking.k8s.io/v1
-  kind: Ingress
-  metadata:
-    annotations:
-      meta.helm.sh/release-name: unionai-controlplane
-      meta.helm.sh/release-namespace: union-cp
-      nginx.ingress.kubernetes.io/app-root: /v2
-      nginx.ingress.kubernetes.io/force-ssl-redirect: "false"
-      nginx.ingress.kubernetes.io/limit-rps: "100"
-      nginx.ingress.kubernetes.io/proxy-body-size: 6m
-      nginx.ingress.kubernetes.io/proxy-buffer-size: 32k
-      nginx.ingress.kubernetes.io/proxy-buffers: 4 32k
-      nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host
-      nginx.ingress.kubernetes.io/server-snippet: |
-        client_header_timeout 604800;
-        client_body_timeout 604800;
-        # Increasing the default configuration from
-        # client_header_buffer_size 1k;
-        # large_client_header_buffers 4 8k;
-        # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason
-        # about expected header sizs (PE-1101).
-        # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller
-        # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300
-        # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179
-        # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size.
-        # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue.
- client_header_buffer_size 16k; - large_client_header_buffers 64 32k; - nginx.ingress.kubernetes.io/service-upstream: "true" - creationTimestamp: "2025-12-31T02:01:18Z" - generation: 1 - labels: - app.kubernetes.io/managed-by: Helm - name: controlplane - namespace: union-cp - resourceVersion: "1767147225349343015" - uid: 2d19fffd-9356-4aae-973c-3f85dcd39756 - spec: - ingressClassName: controlplane - rules: - - http: - paths: - - backend: - service: - name: flyteadmin - port: - number: 87 - path: /openapi - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /healthcheck - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /me - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 87 - path: /openapi/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /.well-known - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /.well-known/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /login - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /login/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /logout - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /logout/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /callback - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /callback/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /config - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /config/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /oauth2 - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /oauth2/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /auth - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /auth/* - pathType: ImplementationSpecific - - backend: - service: - name: usage - port: - number: 81 - path: /enqueue_metronome_request/v1 - pathType: ImplementationSpecific - - backend: - service: - name: usage - port: - number: 81 - path: /enqueue_metronome_request/v1/* - pathType: ImplementationSpecific - - backend: - service: - name: usage - port: - number: 81 - path: /enqueue_stripe_request/v1 - pathType: ImplementationSpecific - - backend: - service: - name: usage - port: - number: 81 - path: /enqueue_stripe_request/v1/* - pathType: ImplementationSpecific - tls: - - hosts: - - localhost - - controlplane-nginx-controller.union-cp.svc.cluster.local - secretName: controlplane-tls-cert - status: - loadBalancer: - ingress: - - ip: 34.118.225.101 -- apiVersion: networking.k8s.io/v1 - kind: Ingress - metadata: - annotations: - meta.helm.sh/release-name: unionai-controlplane - meta.helm.sh/release-namespace: union-cp - nginx.ingress.kubernetes.io/app-root: /v2 - nginx.ingress.kubernetes.io/auth-cache-key: $http_flyte_authorization$http_cookie - 
nginx.ingress.kubernetes.io/force-ssl-redirect: "false" - nginx.ingress.kubernetes.io/limit-rps: "100" - nginx.ingress.kubernetes.io/proxy-body-size: 6m - nginx.ingress.kubernetes.io/proxy-buffer-size: 32k - nginx.ingress.kubernetes.io/proxy-buffers: 4 32k - nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host - nginx.ingress.kubernetes.io/server-snippet: | - client_header_timeout 604800; - client_body_timeout 604800; - # Increasing the default configuration from - # client_header_buffer_size 1k; - # large_client_header_buffers 4 8k; - # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason - # about expected header sizs (PE-1101). - # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller - # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 - # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 - # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. - # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. - client_header_buffer_size 16k; - large_client_header_buffers 64 32k; - nginx.ingress.kubernetes.io/service-upstream: "true" - nginx.org/websocket-services: dataproxy-service - creationTimestamp: "2025-12-31T02:01:18Z" - generation: 1 - labels: - app.kubernetes.io/managed-by: Helm - name: controlplane-console-protected - namespace: union-cp - resourceVersion: "1767147225552191021" - uid: d0575e10-379c-42b0-9ef2-9a479d053ea4 - spec: - ingressClassName: controlplane - rules: - - http: - paths: - - backend: - service: - name: flyteconsole - port: - number: 80 - path: / - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /console - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /console/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /dashboard - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /dashboard/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /resources - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /resources/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /cost - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /cost/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /loading - pathType: ImplementationSpecific - - backend: - service: - name: flyteconsole - port: - number: 80 - path: /loading/* - pathType: ImplementationSpecific - - backend: - service: - name: unionconsole - port: - number: 80 - path: /v2 - pathType: ImplementationSpecific - - backend: - service: - name: unionconsole - port: - number: 80 - path: /v2/* - pathType: ImplementationSpecific - tls: - - hosts: - - localhost - - 
controlplane-nginx-controller.union-cp.svc.cluster.local - secretName: controlplane-tls-cert - status: - loadBalancer: - ingress: - - ip: 34.118.225.101 -- apiVersion: networking.k8s.io/v1 - kind: Ingress - metadata: - annotations: - meta.helm.sh/release-name: unionai-controlplane - meta.helm.sh/release-namespace: union-cp - nginx.ingress.kubernetes.io/app-root: /v2 - nginx.ingress.kubernetes.io/force-ssl-redirect: "false" - nginx.ingress.kubernetes.io/limit-rps: "100" - nginx.ingress.kubernetes.io/proxy-body-size: 6m - nginx.ingress.kubernetes.io/proxy-buffer-size: 32k - nginx.ingress.kubernetes.io/proxy-buffers: 4 32k - nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host - nginx.ingress.kubernetes.io/server-snippet: | - client_header_timeout 604800; - client_body_timeout 604800; - # Increasing the default configuration from - # client_header_buffer_size 1k; - # large_client_header_buffers 4 8k; - # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason - # about expected header sizs (PE-1101). - # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller - # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 - # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 - # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. - # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
- client_header_buffer_size 16k; - large_client_header_buffers 64 32k; - nginx.ingress.kubernetes.io/service-upstream: "true" - nginx.org/websocket-services: dataproxy-service - creationTimestamp: "2025-12-31T02:01:16Z" - generation: 1 - labels: - app.kubernetes.io/managed-by: Helm - name: controlplane-dataproxy - namespace: union-cp - resourceVersion: "1767147225160959009" - uid: beed73ae-a991-4b8c-b4a7-89ad62a7c7e5 - spec: - ingressClassName: controlplane - rules: - - http: - paths: - - backend: - service: - name: dataproxy - port: - number: 80 - path: /data/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /data - pathType: Prefix - tls: - - hosts: - - localhost - - controlplane-nginx-controller.union-cp.svc.cluster.local - secretName: controlplane-tls-cert - status: - loadBalancer: - ingress: - - ip: 34.118.225.101 -- apiVersion: networking.k8s.io/v1 - kind: Ingress - metadata: - annotations: - meta.helm.sh/release-name: unionai-controlplane - meta.helm.sh/release-namespace: union-cp - nginx.ingress.kubernetes.io/app-root: /v2 - nginx.ingress.kubernetes.io/backend-protocol: GRPC - nginx.ingress.kubernetes.io/force-ssl-redirect: "false" - nginx.ingress.kubernetes.io/limit-rps: "100" - nginx.ingress.kubernetes.io/proxy-body-size: 6m - nginx.ingress.kubernetes.io/proxy-buffer-size: 32k - nginx.ingress.kubernetes.io/proxy-buffers: 4 32k - nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host - nginx.ingress.kubernetes.io/server-snippet: | - client_header_timeout 604800; - client_body_timeout 604800; - # Increasing the default configuration from - # client_header_buffer_size 1k; - # large_client_header_buffers 4 8k; - # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason - # about expected header sizs (PE-1101). - # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller - # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 - # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 - # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. - # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
- client_header_buffer_size 16k; - large_client_header_buffers 64 32k; - nginx.ingress.kubernetes.io/service-upstream: "true" - creationTimestamp: "2025-12-31T02:01:18Z" - generation: 1 - labels: - app.kubernetes.io/managed-by: Helm - name: controlplane-grpc - namespace: union-cp - resourceVersion: "1767147226555839004" - uid: ef1bfdd1-41fb-43e2-acc5-2e59dcb86a4f - spec: - ingressClassName: controlplane - rules: - - http: - paths: - - backend: - service: - name: flyteadmin - port: - number: 81 - path: /grpc.health.v1.Health - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 81 - path: /grpc.health.v1.Health/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 81 - path: /flyteidl.service.AuthMetadataService - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 81 - path: /flyteidl.service.AuthMetadataService/* - pathType: ImplementationSpecific - tls: - - hosts: - - localhost - - controlplane-nginx-controller.union-cp.svc.cluster.local - secretName: controlplane-tls-cert - status: - loadBalancer: - ingress: - - ip: 34.118.225.101 -- apiVersion: networking.k8s.io/v1 - kind: Ingress - metadata: - annotations: - meta.helm.sh/release-name: unionai-controlplane - meta.helm.sh/release-namespace: union-cp - nginx.ingress.kubernetes.io/app-root: /v2 - nginx.ingress.kubernetes.io/backend-protocol: GRPC - nginx.ingress.kubernetes.io/force-ssl-redirect: "false" - nginx.ingress.kubernetes.io/limit-rps: "100" - nginx.ingress.kubernetes.io/proxy-body-size: 6m - nginx.ingress.kubernetes.io/proxy-buffer-size: 32k - nginx.ingress.kubernetes.io/proxy-buffers: 4 32k - nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host - nginx.ingress.kubernetes.io/server-snippet: | - client_header_timeout 604800; - client_body_timeout 604800; - # Increasing the default configuration from - # client_header_buffer_size 1k; - # large_client_header_buffers 4 8k; - # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason - # about expected header sizs (PE-1101). - # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller - # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 - # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 - # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. - # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
- client_header_buffer_size 16k; - large_client_header_buffers 64 32k; - nginx.ingress.kubernetes.io/service-upstream: "true" - creationTimestamp: "2025-12-31T02:01:18Z" - generation: 1 - labels: - app.kubernetes.io/managed-by: Helm - name: controlplane-grpc-streaming - namespace: union-cp - resourceVersion: "1767147226350111011" - uid: f2d9c773-d1e6-4e0a-a88d-703109b10f96 - spec: - ingressClassName: controlplane - rules: - - http: - paths: - - backend: - service: - name: flyteadmin - port: - number: 81 - path: /flyteidl.service.WatchService/WatchExecutionStatusUpdates - pathType: ImplementationSpecific - tls: - - hosts: - - localhost - - controlplane-nginx-controller.union-cp.svc.cluster.local - secretName: controlplane-tls-cert - status: - loadBalancer: - ingress: - - ip: 34.118.225.101 -- apiVersion: networking.k8s.io/v1 - kind: Ingress - metadata: - annotations: - meta.helm.sh/release-name: unionai-controlplane - meta.helm.sh/release-namespace: union-cp - nginx.ingress.kubernetes.io/app-root: /v2 - nginx.ingress.kubernetes.io/force-ssl-redirect: "false" - nginx.ingress.kubernetes.io/limit-rps: "100" - nginx.ingress.kubernetes.io/proxy-body-size: 6m - nginx.ingress.kubernetes.io/proxy-buffer-size: 32k - nginx.ingress.kubernetes.io/proxy-buffers: 4 32k - nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host - nginx.ingress.kubernetes.io/server-snippet: | - client_header_timeout 604800; - client_body_timeout 604800; - # Increasing the default configuration from - # client_header_buffer_size 1k; - # large_client_header_buffers 4 8k; - # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason - # about expected header sizs (PE-1101). - # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller - # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 - # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 - # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. - # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. 
- client_header_buffer_size 16k; - large_client_header_buffers 64 32k; - nginx.ingress.kubernetes.io/service-upstream: "true" - nginx.org/websocket-services: dataproxy-service - creationTimestamp: "2025-12-31T02:01:17Z" - generation: 1 - labels: - app.kubernetes.io/managed-by: Helm - name: controlplane-protected - namespace: union-cp - resourceVersion: "1767147225755263000" - uid: 54cdb54c-e805-4cb7-a393-c86378243a1a - spec: - ingressClassName: controlplane - rules: - - http: - paths: - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /api - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /api/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /v1/* - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /cloudadmin - pathType: ImplementationSpecific - - backend: - service: - name: flyteadmin - port: - number: 80 - path: /cloudadmin/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /actor - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /actor/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /agent - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /agent/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /dataplane - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /dataplane/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /spark-history-server - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /spark-history-server/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /api/v1/dataproxy - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 81 - path: /api/v1/dataproxy/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 81 - path: /app - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 81 - path: /app/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 81 - path: /apps - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 81 - path: /apps/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /cluster - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /cluster/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /clusterpool - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /clusterpool/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /clusterconfig - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /clusterconfig/* - pathType: ImplementationSpecific - - backend: - service: - name: organizations - port: - number: 81 - path: /org - pathType: ImplementationSpecific - - backend: 
- service: - name: organizations - port: - number: 81 - path: /org/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /managed_cluster - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 81 - path: /managed_cluster/* - pathType: ImplementationSpecific - - backend: - service: - name: authorizer - port: - number: 81 - path: /authorizer - pathType: ImplementationSpecific - - backend: - service: - name: authorizer - port: - number: 81 - path: /authorizer/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /oauth_app - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /oauth_app/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /users - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /users/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /roles - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /roles/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /policies - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /policies/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /identities - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 81 - path: /identities/* - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /echo - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /echo/* - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /execution - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /execution/* - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /workspace_registry - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /workspace_registry/* - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /workspace_instance - pathType: ImplementationSpecific - - backend: - service: - name: execution - port: - number: 81 - path: /workspace_instance/* - pathType: ImplementationSpecific - tls: - - hosts: - - localhost - - controlplane-nginx-controller.union-cp.svc.cluster.local - secretName: controlplane-tls-cert - status: - loadBalancer: - ingress: - - ip: 34.118.225.101 -- apiVersion: networking.k8s.io/v1 - kind: Ingress - metadata: - annotations: - meta.helm.sh/release-name: unionai-controlplane - meta.helm.sh/release-namespace: union-cp - nginx.ingress.kubernetes.io/app-root: /v2 - nginx.ingress.kubernetes.io/backend-protocol: GRPC - nginx.ingress.kubernetes.io/force-ssl-redirect: "false" - nginx.ingress.kubernetes.io/limit-rps: "100" - nginx.ingress.kubernetes.io/proxy-body-size: 6m - nginx.ingress.kubernetes.io/proxy-buffer-size: 32k - nginx.ingress.kubernetes.io/proxy-buffers: 4 32k - nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host - nginx.ingress.kubernetes.io/server-snippet: | - 
client_header_timeout 604800; - client_body_timeout 604800; - # Increasing the default configuration from - # client_header_buffer_size 1k; - # large_client_header_buffers 4 8k; - # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason - # about expected header sizs (PE-1101). - # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller - # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300 - # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179 - # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size. - # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue. - client_header_buffer_size 16k; - large_client_header_buffers 64 32k; - nginx.ingress.kubernetes.io/service-upstream: "true" - creationTimestamp: "2025-12-31T02:01:17Z" - generation: 1 - labels: - app.kubernetes.io/managed-by: Helm - name: controlplane-protected-grpc - namespace: union-cp - resourceVersion: "1767147225957231019" - uid: 3a6048e1-465d-4762-a8f9-f54a4d34e285 - spec: - ingressClassName: controlplane - rules: - - http: - paths: - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.execution.ExecutionService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.execution.ExecutionService - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.cluster.ClusterService/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.cluster.ClusterService - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 83 - path: /cloudidl.apikey.APIKeyService/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 83 - path: /cloudidl.apikey.APIKeyService - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.AppsService/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.AppsService - pathType: ImplementationSpecific - - backend: - service: - name: organizations - port: - number: 80 - path: /cloudidl.org.OrgService/* - pathType: ImplementationSpecific - - backend: - service: - name: organizations - port: - number: 80 - path: /cloudidl.org.OrgService - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.cloudaccounts.CloudAccountsService/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.cloudaccounts.CloudAccountsService - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.cluster.ManagedClusterService/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.cluster.ManagedClusterService - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 
80 - path: /cloudidl.identity.UserService/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.UserService - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.RoleService/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.RoleService - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.PolicyService/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.PolicyService - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.SelfServe/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.SelfServe - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.IdentityService/* - pathType: ImplementationSpecific - - backend: - service: - name: identity - port: - number: 80 - path: /cloudidl.identity.IdentityService - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.clusterpool.ClusterPoolService/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.clusterpool.ClusterPoolService - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.clusterconfig.ClusterConfigService/* - pathType: ImplementationSpecific - - backend: - service: - name: cluster - port: - number: 80 - path: /cloudidl.clusterconfig.ClusterConfigService - pathType: ImplementationSpecific - - backend: - service: - name: authorizer - port: - number: 80 - path: /cloudidl.authorizer.AuthorizerService/* - pathType: ImplementationSpecific - - backend: - service: - name: authorizer - port: - number: 80 - path: /cloudidl.authorizer.AuthorizerService - pathType: ImplementationSpecific - - backend: - service: - name: datacatalog - port: - number: 89 - path: /datacatalog.DataCatalog/* - pathType: ImplementationSpecific - - backend: - service: - name: datacatalog - port: - number: 89 - path: /datacatalog.DataCatalog - pathType: ImplementationSpecific - - backend: - service: - name: cacheservice - port: - number: 89 - path: /flyteidl.cacheservice.CacheService/* - pathType: ImplementationSpecific - - backend: - service: - name: cacheservice - port: - number: 89 - path: /flyteidl.cacheservice.CacheService - pathType: ImplementationSpecific - - backend: - service: - name: cacheservice - port: - number: 89 - path: /flyteidl.cacheservice.v2.CacheService/* - pathType: ImplementationSpecific - - backend: - service: - name: cacheservice - port: - number: 89 - path: /flyteidl.cacheservice.v2.CacheService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.actor.ActorEnvironmentService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.actor.ActorEnvironmentService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.agent.AgentService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 
- path: /cloudidl.agent.AgentService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.secret.SecretService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.secret.SecretService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /flyteidl2.secret.SecretService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /flyteidl2.secret.SecretService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.support.SupportService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.clouddataproxy.CloudDataProxyService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.clouddataproxy.CloudDataProxyService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /flyteidl.service.DataProxyService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /flyteidl.service.DataProxyService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.logs.LogsService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.logs.LogsService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workspace.WorkspaceRegistryService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workspace.WorkspaceRegistryService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workspace.WorkspaceInstanceService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workspace.WorkspaceInstanceService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.RunService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.RunService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.InternalRunService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.InternalRunService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.TranslatorService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.TranslatorService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.TaskService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.TaskService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.workflow.TriggerService - pathType: ImplementationSpecific - - backend: - service: - name: 
executions - port: - number: 80 - path: /cloudidl.workflow.TriggerService/* - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /cloudidl.workflow.QueueService - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /cloudidl.workflow.QueueService/* - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /cloudidl.workflow.StateService - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /cloudidl.workflow.StateService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.workflow.RunService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.workflow.RunService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.workflow.TranslatorService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.workflow.TranslatorService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.task.TaskService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.task.TaskService/* - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /flyteidl2.workflow.QueueService - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /flyteidl2.workflow.QueueService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.trigger.TriggerService - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /flyteidl2.trigger.TriggerService/* - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /flyteidl2.workflow.StateService - pathType: ImplementationSpecific - - backend: - service: - name: queue - port: - number: 80 - path: /flyteidl2.workflow.StateService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.imagebuilder.ImageService - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.imagebuilder.ImageService/* - pathType: ImplementationSpecific - - backend: - service: - name: executions - port: - number: 80 - path: /cloudidl.app.AppService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.app.AppLogsService/* - pathType: ImplementationSpecific - - backend: - service: - name: dataproxy - port: - number: 80 - path: /cloudidl.app.ReplicaService/* - pathType: ImplementationSpecific - tls: - - hosts: - - localhost - - controlplane-nginx-controller.union-cp.svc.cluster.local - secretName: controlplane-tls-cert - status: - loadBalancer: - ingress: - - ip: 34.118.225.101 -- apiVersion: networking.k8s.io/v1 - kind: Ingress - metadata: - annotations: - meta.helm.sh/release-name: unionai-controlplane - meta.helm.sh/release-namespace: union-cp - nginx.ingress.kubernetes.io/app-root: /v2 - nginx.ingress.kubernetes.io/backend-protocol: GRPC - nginx.ingress.kubernetes.io/force-ssl-redirect: "false" - 
-      nginx.ingress.kubernetes.io/limit-rps: "100"
-      nginx.ingress.kubernetes.io/proxy-body-size: 6m
-      nginx.ingress.kubernetes.io/proxy-buffer-size: 32k
-      nginx.ingress.kubernetes.io/proxy-buffers: 4 32k
-      nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host
-      nginx.ingress.kubernetes.io/server-snippet: |
-        client_header_timeout 604800;
-        client_body_timeout 604800;
-        # Increasing the default configuration from
-        # client_header_buffer_size 1k;
-        # large_client_header_buffers 4 8k;
-        # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason
-        # about expected header sizs (PE-1101).
-        # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller
-        # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300
-        # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179
-        # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size.
-        # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue.
-        client_header_buffer_size 16k;
-        large_client_header_buffers 64 32k;
-      nginx.ingress.kubernetes.io/service-upstream: "true"
-    creationTimestamp: "2025-12-31T02:01:17Z"
-    generation: 1
-    labels:
-      app.kubernetes.io/managed-by: Helm
-    name: controlplane-protected-grpc-streaming
-    namespace: union-cp
-    resourceVersion: "1767147226149727024"
-    uid: 2bb4bb0b-5ff5-44bd-90d9-e5caafa5fe97
-  spec:
-    ingressClassName: controlplane
-    rules:
-    - http:
-        paths:
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.AdminService
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.AdminService/*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.WatchService
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.WatchService/*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /cloudidl.cloudadmin.CloudAdminService
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /cloudidl.cloudadmin.CloudAdminService/*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.IdentityService
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.IdentityService/*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.echo.EchoService/*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.echo.EchoService
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.SignalService
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: flyteadmin
-              port:
-                number: 81
-          path: /flyteidl.service.SignalService/*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: dataproxy
-              port:
-                number: 80
-          path: /cloudidl.actor.ActorEnvironmentService/Stream*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.execution.ExecutionService/GetExecutionOperation
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.workflow.RunLogsService/TailLogs
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.workflow.RunService/Watch*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.workflow.InternalRunService/Record*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.workflow.InternalRunService/Update*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.workflow.TaskService/Watch*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /cloudidl.workflow.LeaseService/Heartbeat
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /cloudidl.workflow.QueueService/Heartbeat
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /cloudidl.workflow.StateService/Watch*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /cloudidl.workflow.QueueService/StreamLeases
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /cloudidl.workflow.LeaseService/StreamLeases
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /flyteidl2.workflow.RunLogsService/TailLogs
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /flyteidl2.workflow.RunService/Watch*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /flyteidl2.task.TaskService/Watch*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /flyteidl2.workflow.QueueService/Heartbeat
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /flyteidl2.workflow.StateService/Watch*
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: queue
-              port:
-                number: 80
-          path: /flyteidl2.workflow.QueueService/StreamLeases
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: dataproxy
-              port:
-                number: 80
-          path: /cloudidl.logs.LogsService/TailTaskExecutionLogs
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.workspace.WorkspaceInstanceService/WatchWorkspaceInstances
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.app.AppService/Watch
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: executions
-              port:
-                number: 80
-          path: /cloudidl.app.AppService/Lease
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: dataproxy
-              port:
-                number: 80
-          path: /cloudidl.app.AppLogsService/TailLogs
-          pathType: ImplementationSpecific
-        - backend:
-            service:
-              name: dataproxy
-              port:
-                number: 80
-          path: /cloudidl.app.ReplicaService/WatchReplicas
-          pathType: ImplementationSpecific
-    tls:
-    - hosts:
-      - localhost
-      - controlplane-nginx-controller.union-cp.svc.cluster.local
-      secretName: controlplane-tls-cert
-  status:
-    loadBalancer:
-      ingress:
-      - ip: 34.118.225.101
-- apiVersion: networking.k8s.io/v1
-  kind: Ingress
-  metadata:
-    annotations:
-      meta.helm.sh/release-name: unionai-controlplane
-      meta.helm.sh/release-namespace: union-cp
-      nginx.ingress.kubernetes.io/app-root: /v2
-      nginx.ingress.kubernetes.io/force-ssl-redirect: "false"
-      nginx.ingress.kubernetes.io/limit-rps: "100"
-      nginx.ingress.kubernetes.io/proxy-body-size: 6m
-      nginx.ingress.kubernetes.io/proxy-buffer-size: 32k
-      nginx.ingress.kubernetes.io/proxy-buffers: 4 32k
-      nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host
-      nginx.ingress.kubernetes.io/server-snippet: |
-        client_header_timeout 604800;
-        client_body_timeout 604800;
-        # Increasing the default configuration from
-        # client_header_buffer_size 1k;
-        # large_client_header_buffers 4 8k;
-        # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason
-        # about expected header sizs (PE-1101).
-        # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller
-        # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300
-        # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179
-        # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size.
-        # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue.
-        client_header_buffer_size 16k;
-        large_client_header_buffers 64 32k;
-      nginx.ingress.kubernetes.io/service-upstream: "true"
-      nginx.ingress.kubernetes.io/use-regex: "true"
-      nginx.org/websocket-services: dataproxy-service
-    creationTimestamp: "2025-12-31T02:01:17Z"
-    generation: 1
-    labels:
-      app.kubernetes.io/managed-by: Helm
-    name: controlplane-usage
-    namespace: union-cp
-    resourceVersion: "1767147226751407005"
-    uid: f6810bcc-27a5-467c-991c-0d3fc91ad4f8
-  spec:
-    ingressClassName: controlplane
-    rules:
-    - http:
-        paths:
-        - backend:
-            service:
-              name: usage
-              port:
-                number: 81
-          path: /usage/api/v1(/(?!custom_measures_names|measure_group|measure_groups|billable_measures|billing_info|report_billable_usage|customer_credits|checkout_session).*|$)
-          pathType: ImplementationSpecific
-    tls:
-    - hosts:
-      - localhost
-      - controlplane-nginx-controller.union-cp.svc.cluster.local
-      secretName: controlplane-tls-cert
-  status:
-    loadBalancer:
-      ingress:
-      - ip: 34.118.225.101
-- apiVersion: networking.k8s.io/v1
-  kind: Ingress
-  metadata:
-    annotations:
-      meta.helm.sh/release-name: unionai-controlplane
-      meta.helm.sh/release-namespace: union-cp
-      nginx.ingress.kubernetes.io/app-root: /v2
-      nginx.ingress.kubernetes.io/backend-protocol: GRPC
-      nginx.ingress.kubernetes.io/force-ssl-redirect: "false"
-      nginx.ingress.kubernetes.io/limit-rps: "100"
-      nginx.ingress.kubernetes.io/proxy-body-size: 6m
-      nginx.ingress.kubernetes.io/proxy-buffer-size: 32k
-      nginx.ingress.kubernetes.io/proxy-buffers: 4 32k
-      nginx.ingress.kubernetes.io/proxy-cookie-domain: ~^ .$host
-      nginx.ingress.kubernetes.io/server-snippet: |
-        client_header_timeout 604800;
-        client_body_timeout 604800;
-        # Increasing the default configuration from
-        # client_header_buffer_size 1k;
-        # large_client_header_buffers 4 8k;
-        # to default of 16k and 32k for large buffer sizes. These sizes are chosen as a short term mediation until we can collect data to reason
-        # about expected header sizs (PE-1101).
-        # Historically, we have seen is with the previous 8k max buffer size , the auth endpoint of /me would throw 400 Bad request and due to this ingress controller
-        # threw a 500 as it doesn't expect this status code on auth request expected range : 200 <= authcall.status(i.e status of /me call) <=300
-        # Code link for ref : https://github.com/nginx/nginx/blob/e734df6664e70f118ca3140bcef6d4f1750fa8fa/src/http/modules/ngx_http_auth_request_module.c#L170-L179
-        # Now the main reason we have seen 400 bad request is large size of the cookies which contribute to the header size.
-        # We should keep reducing the size of what headers are being sent meanwhile we increase this size to mitigate the long header issue.
-        client_header_buffer_size 16k;
-        large_client_header_buffers 64 32k;
-      nginx.ingress.kubernetes.io/service-upstream: "true"
-      nginx.ingress.kubernetes.io/use-regex: "true"
-    creationTimestamp: "2025-12-31T02:01:16Z"
-    generation: 1
-    labels:
-      app.kubernetes.io/managed-by: Helm
-    name: controlplane-usage-grpc
-    namespace: union-cp
-    resourceVersion: "1767147224955919015"
-    uid: a0dd692c-516c-4a39-9c9c-9f01a0740e10
-  spec:
-    ingressClassName: controlplane
-    rules:
-    - http:
-        paths:
-        - backend:
-            service:
-              name: usage
-              port:
-                number: 80
-          path: /cloudidl.usage.UsageService(/(?!GetCustomMeasuresNames|GetMeasureGroup|GetMeasureGroups|GetBillableMeasures|GetBillingInfo|ReportBillableUsage|ReportServerlessBillableUsage|CreateCustomer|AttachBillingPlanToCustomer|GetCustomerCredits|EnqueueMetronomeRequest|EnqueueStripeRequest|GetOrgCheckoutSession).*|$)
-          pathType: ImplementationSpecific
-    tls:
-    - hosts:
-      - localhost
-      - controlplane-nginx-controller.union-cp.svc.cluster.local
-      secretName: controlplane-tls-cert
-  status:
-    loadBalancer:
-      ingress:
-      - ip: 34.118.225.101
-kind: List
-metadata:
-  resourceVersion: ""
diff --git a/charts/controlplane/templates/service.yaml b/charts/controlplane/templates/service.yaml
index 87cabf1b..f72175bb 100644
--- a/charts/controlplane/templates/service.yaml
+++ b/charts/controlplane/templates/service.yaml
@@ -9,8 +9,12 @@ metadata:
   name: {{ include "unionai.fullname" $service }}
   labels:
     {{- include "unionai.labels" $service | nindent 4 }}
-spec:
   {{- $svc := include "unionai.service" $service | fromYaml }}
+  {{- if $svc.annotations }}
+  annotations:
+    {{- toYaml $svc.annotations | nindent 4 }}
+  {{- end }}
+spec:
   type: {{ $svc.type | default "ClusterIP" }}
   ports:
   - name: grpc
diff --git a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml b/charts/controlplane/values.gcp.controlplane.selfhosted-intracluster.yaml
similarity index 95%
rename from charts/controlplane/values.gcp.selfhosted-intracluster.yaml
rename to charts/controlplane/values.gcp.controlplane.selfhosted-intracluster.yaml
index 175c0155..df3e3893 100644
--- a/charts/controlplane/values.gcp.selfhosted-intracluster.yaml
+++ b/charts/controlplane/values.gcp.controlplane.selfhosted-intracluster.yaml
@@ -267,7 +267,10 @@ flyte:
       iam.gke.io/gcp-service-account: "{{ .Values.global.FLYTEADMIN_IAM_ROLE_ARN }}"
     imagePullSecrets:
       - name: union-registry-secret
-
+    service:
+      type: LoadBalancer
+      annotations:
+        networking.gke.io/load-balancer-type: "Internal"
   flytescheduler:
     image:
       # flyte-core subchart doesn't render templates, must use hardcoded repository
@@ -306,6 +309,10 @@ flyte:
       iam.gke.io/gcp-service-account: "{{ .Values.global.FLYTEADMIN_IAM_ROLE_ARN }}"
     imagePullSecrets:
       - name: union-registry-secret
+    service:
+      type: LoadBalancer
+      annotations:
+        networking.gke.io/load-balancer-type: "Internal"
 
 # ----------------------------------------------------------------------------
 # SECTION 6: Ingress Configuration
 # ----------------------------------------------------------------------------
@@ -332,9 +339,11 @@
 ingress-nginx:
   controller:
     service:
-      # ClusterIP for intra-cluster only (no external access)
-      # Change to LoadBalancer or NodePort for external access
-      type: ClusterIP
+      # Load balancer for access to API and UI
+      # Change to ClusterIP if no external cluster access needed
+      type: LoadBalancer
+      annotations:
+        networking.gke.io/load-balancer-type: "Internal"
       ports:
         http: 80
         https: 443
@@ -405,6 +414,17 @@ services:
       endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}'
       insecure: true
 
+
+  # Queue service configuration
+  queue:
+    service:
+      type: LoadBalancer
+      annotations:
+        networking.gke.io/load-balancer-type: "Internal"
+      grpcport: 80
+      httpport: 81
+      debugport: 82
+
 # ----------------------------------------------------------------------------
 # SECTION 9: ScyllaDB Configuration
 # ----------------------------------------------------------------------------
@@ -437,6 +457,5 @@ scylla:
       parameters:
         type: pd-standard
 
-
 scylla-operator:
   enabled: true
diff --git a/charts/dataplane/values.gcp.selfhosted-intracluster.yaml b/charts/dataplane/values.gcp.dataplane.selfhosted-intracluster.yaml
similarity index 97%
rename from charts/dataplane/values.gcp.selfhosted-intracluster.yaml
rename to charts/dataplane/values.gcp.dataplane.selfhosted-intracluster.yaml
index 8f199a2b..70c5d136 100644
--- a/charts/dataplane/values.gcp.selfhosted-intracluster.yaml
+++ b/charts/dataplane/values.gcp.dataplane.selfhosted-intracluster.yaml
@@ -200,7 +200,7 @@ config:
   admin:
     # Flyteadmin endpoint (control plane admin service)
     # Example: "flyteadmin.union-cp.svc.cluster.local:81"
-    endpoint: '{{ .Values.global.FLYTEADMIN_ENDPOINT }}'
+    endpoint: 'dns:///{{ .Values.global.FLYTEADMIN_ENDPOINT }}'
     # Use insecure (non-TLS) connection for intra-cluster HTTP communication
     insecure: true
 
@@ -217,7 +217,7 @@
 
     # Control plane host for catalog service
    # Example: "dns:///controlplane-nginx-controller.union-cp.svc.cluster.local"
-    cache-endpoint: '{{ .Values.global.CACHESERVICE_ENDPOINT }}'
+    cache-endpoint: 'dns:///{{ .Values.global.CACHESERVICE_ENDPOINT }}'
     endpoint: ""
 
     # Use insecure connection for intra-cluster communication
@@ -316,9 +316,11 @@
 ingress-nginx:
   controller:
     service:
-      # Use ClusterIP for intra-cluster only (no external LoadBalancer)
-      # Change to LoadBalancer or NodePort for external access
-      type: "ClusterIP"
+      # Load balancer for access to API and UI
+      # Change to ClusterIP if no external cluster access needed
+      type: LoadBalancer
+      annotations:
+        networking.gke.io/load-balancer-type: "Internal"
       ports:
         http: 80
         https: 443