diff --git a/tests/generated/dataplane.baremetal-custom-s3.yaml b/tests/generated/dataplane.baremetal-custom-s3.yaml new file mode 100644 index 00000000..a6700606 --- /dev/null +++ b/tests/generated/dataplane.baremetal-custom-s3.yaml @@ -0,0 +1,2765 @@ +--- +# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +automountServiceAccountToken: true +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: clustersync-system + namespace: union +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: executor + namespace: union + labels: + app: executor +--- +# Source: dataplane/templates/nodeexecutor/webhook.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-pod-webhook + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/templates/common/auth-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: union-secret-auth + namespace: union +type: Opaque +data: + # TODO(rob): update or configure operator to use client_secret like all the other components. 
+ app_secret: ZHVtbXktc2VjcmV0LXZhbHVl + client_secret: ZHVtbXktc2VjcmV0LXZhbHVl +--- +# Source: dataplane/templates/common/cluster-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: operator-cluster-name +type: Opaque +data: + cluster_name: dW5pb24tYWNtZQ== +--- +# Source: dataplane/templates/nodeexecutor/webhook.yaml +# prevent duplicate from propeller + +# Create an empty secret that the first propeller pod will populate +apiVersion: v1 +kind: Secret +metadata: + name: union-pod-webhook #prevent name collision with flyte oss + namespace: union +type: Opaque +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-clusterresourcesync-config + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + cluster_resources.yaml: | + cluster_resources: + clusterName: 'union-acme' + customData: + - production: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - staging: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + - development: + - projectQuotaCpu: + value: "4096" + - projectQuotaMemory: + value: 2Ti + - projectQuotaNvidiaGpu: + value: "256" + - defaultUserRoleKey: + value: 'eks.amazonaws.com/role-arn' + - defaultUserRoleValue: + value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + refreshInterval: 5m + standaloneDeployment: true + templatePath: /etc/flyte/clusterresource/templates + clusterResourcesPrivate: + app: + isServerless: false + union: + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'acme-union-acme-operator' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + connection: + host: dns:///acme.eu-west-2.unionai.cloud + admin.yaml: | + admin: + clientId: 'acme-union-acme-operator' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///acme.eu-west-2.unionai.cloud + insecure: false + event: + capacity: 1000 + rate: 500 + type: admin + domain.yaml: | + domains: + - id: development + name: development + - id: staging + name: staging + - id: production + name: production + namespace_config.yaml: | + namespace_mapping: + template: '{{`{{ project }}`}}' + clusters.yaml: | + clusters: + clusterConfigs: [] + labelClusterMap: {} + logger.yaml: | + logger: + level: 4 + show-source: true + namespace_mapping.yaml: | + namespace_mapping: + template: '{{ project }}' +--- +# Source: dataplane/templates/clusterresourcesync/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: clusterresource-template + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + a_namespace.yaml: | + apiVersion: v1 + kind: Namespace + metadata: + name: {{ namespace }} + labels: + union.ai/namespace-type: flyte + spec: + finalizers: + - kubernetes + + b_default_service_account.yaml: | + apiVersion: v1 + kind: 
ServiceAccount + metadata: + name: default + namespace: {{ namespace }} + annotations: + {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + + c_project_resource_quota.yaml: | + apiVersion: v1 + kind: ResourceQuota + metadata: + name: project-quota + namespace: {{ namespace }} + spec: + hard: + limits.cpu: {{ projectQuotaCpu }} + limits.memory: {{ projectQuotaMemory }} + requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name : union-operator-buildkit +data: + buildkitd.toml: | + debug = false + + [log] + format = "text" + + [worker.oci] + enabled = true + snapshotter = "auto" + gc = true + max-parallelism = 0 + + # Should not be used if Policies are defined + gckeepstorage = "10%" + [[worker.oci.gcpolicy]] + # Remove COPY/ADD and git checkout files + keepBytes = "10%" + keepDuration = "24h" + filters = [ "type==source.local", "type==source.git.checkout" ] + [[worker.oci.gcpolicy]] + # Remove locally cached image layers after it's unused for 24 hours + keepBytes = "10%" + keepDuration = "24h" + filters = [ "regular" ] + [[worker.oci.gcpolicy]] + # Remove shared cache mounts. E.G. Pip cache + keepBytes = "10%" + keepDuration = "72h" + filters = [ "type==exec.cachemount" ] + [[worker.oci.gcpolicy]] + # Remove everything else to keep the cache size under total file system limit + all = true + keepBytes = "80%" +--- +# Source: dataplane/templates/nodeexecutor/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: executor + namespace: union + labels: + app: executor +data: + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + kubernetes-enabled: false + templates: + - displayName: Grafana Logs + templateUris: + - https://grafana-infra.example.internal/explore?schemaVersion=1&panes=%7B%22pane%22%3A%7B%22datasource%22%3A%22af4lu074kfcaoc%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bnamespace%3D%5C%22{{ .namespace }}%5C%22%2Cpod%3D%5C%22{{ .podName }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22af4lu074kfcaoc%22%7D%2C%22editorMode%22%3A%22code%22%2C%22direction%22%3A%22backward%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22{{ .podUnixStartTime }}000%22%2C%22to%22%3A%22now%22%7D%2C%22panelsState%22%3A%7B%22logs%22%3A%7B%22columns%22%3A%7B%220%22%3A%22Line%22%7D%2C%22visualisationType%22%3A%22logs%22%2C%22labelFieldName%22%3A%22labels%22%2C%22refId%22%3A%22A%22%7D%7D%2C%22compact%22%3Afalse%7D%7D&orgId=1 + enabled_plugins.yaml: | + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - echo + - fast-task + - connector-service + config.yaml: | + executor: + cluster: 'union-acme' + evaluatorCount: 64 + maxActions: 2000 + organization: 'acme' + unionAuth: + injectSecret: true + secretName: EAGER_API_KEY + workerName: worker1 + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 4096 + gpu: 256 + memory: 2Ti + union: + connection: + host: dns:///acme.eu-west-2.unionai.cloud + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'acme-union-acme-operator' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + admin: + clientId: 'acme-union-acme-operator' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: 
dns:///acme.eu-west-2.unionai.cloud + insecure: false + authorizer: + type: noop + catalog-cache: + cache-endpoint: dns:///acme.eu-west-2.unionai.cloud + endpoint: dns:///acme.eu-west-2.unionai.cloud + insecure: false + type: fallback + use-admin-auth: true + logger: + level: 4 + show-source: true + namespace_mapping: + template: '{{ project }}' + sharedService: + metrics: + scope: 'executor:' + security: + allowCors: true + allowLocalhostAccess: true + allowedHeaders: + - Content-Type + allowedOrigins: + - '*' + secure: false + useAuth: false + propeller: + node-config: + disable-input-file-writes: true + persist-cache-status: true + plugins: + fasttask: + additional-worker-args: + - --last-ack-grace-period-seconds + - "120" + callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + grace-period-status-not-found: 2m + ioutils: + remoteFileOutputPaths: + deckFilename: report.html + k8s: + disable-inject-owner-references: true + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + storage: + container: "union" + container: union + stow: + config: + access_key_id: dummy-secret-value + auth_type: accesskey + disable_force_path_style: "true" + disable_ssl: false + endpoint: https://s3.example.com + region: RNO2A + secret_key: dummy-secret-value + kind: s3 + type: stow + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/templates/operator/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + config.yaml: | + union: + connection: + host: dns:///acme.eu-west-2.unionai.cloud + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'acme-union-acme-operator' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + sharedService: + features: + gatewayV2: true + port: 8081 + authorizer: + type: noop + operator: + enabled: true + enableTunnelService: true + tunnel: + enableDirectToAppIngress: false + deploymentToRestart: union-operator-proxy + apps: + enabled: 'false' + syncClusterConfig: + enabled: false + clusterId: + organization: 'acme' + clusterData: + appId: 'acme-union-acme-operator' + bucketName: 'union' + bucketRegion: 'us-east-1' + cloudHostName: 'acme.eu-west-2.unionai.cloud' + gcpProjectId: '' + metadataBucketPrefix: 's3://union' + userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + userRoleKey: 'eks.amazonaws.com/role-arn' + # -- storageType is only used when syncClusterConfig is enabled. It is intentionally disabled and it should not be used. 
+ storageType: custom + customStorageConfig: | + container: union + stow: + config: + access_key_id: dummy-secret-value + auth_type: accesskey + disable_force_path_style: "true" + disable_ssl: false + endpoint: https://s3.example.com + region: RNO2A + secret_key: dummy-secret-value + kind: s3 + type: stow + collectUsages: + enabled: true + billableUsageCollector: + enabled: true + dependenciesHeartbeat: + prometheus: + endpoint: 'http://union-operator-prometheus:80/-/healthy' + proxy: + endpoint: 'http://union-operator-proxy:10254' + imageBuilder: + enabled: true + executionNamespaceLabels: + union.ai/namespace-type: flyte + referenceConfigmapName: union-operator + targetConfigMapName: "build-image-config" + proxy: + imageBuilderConfig: + authenticationType: 'noop' + defaultRepository: 'https://ghcr.io/acme-corp/acme/union' + persistedLogs: + objectStore: + pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} + prefix: persisted-logs + sourceType: ObjectStore + smConfig: + enabled: 'true' + k8sConfig: + namespace: 'union' + type: 'K8s' + logger.yaml: | + logger: + level: 4 + show-source: true + config-overrides.yaml: | + cache: + identity: + enabled: false + storage.yaml: | + storage: + container: "union" + container: union + stow: + config: + access_key_id: dummy-secret-value + auth_type: accesskey + disable_force_path_style: "true" + disable_ssl: false + endpoint: https://s3.example.com + region: RNO2A + secret_key: dummy-secret-value + kind: s3 + type: stow + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + fast_registration_storage.yaml: | + fastRegistrationStorage: + container: "union" + container: union + stow: + config: + access_key_id: dummy-secret-value + auth_type: accesskey + disable_force_path_style: "true" + disable_ssl: false + endpoint: https://s3.example.com + region: RNO2A + secret_key: dummy-secret-value + kind: s3 + type: stow + image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + image-builder.default-repository: "https://ghcr.io/acme-corp/acme/union" + image-builder.authentication-type: "noop" +--- +# Source: dataplane/templates/propeller/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flyte-propeller-webhook-config + namespace: union +data: + core.yaml: | + + + webhook: + certDir: /etc/webhook/certs + embeddedSecretManagerConfig: + imagePullSecrets: + enabled: true + k8sConfig: + namespace: 'union' + type: 'K8s' + listenPort: '9443' + localCert: true + secretManagerTypes: + - Embedded + - K8s + secretName: union-pod-webhook + serviceName: union-pod-webhook + servicePort: '443' +--- +# Source: dataplane/charts/opencost/templates/clusterrole.yaml +# Cluster role giving opencost to get, list, watch required resources +# No write permissions are required +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] + resources: + - configmaps + - deployments + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - 
daemonsets + - deployments + - replicasets + verbs: + - get + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - deployments + - daemonsets + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - get + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - get + - list + - watch + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: clustersync-resource +rules: + - apiGroups: + - "" + - rbac.authorization.k8s.io + resources: + - configmaps + - namespaces + - pods + - resourcequotas + - roles + - rolebindings + - secrets + - services + - serviceaccounts + - clusterrolebindings + verbs: + - '*' +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update +--- +# Source: dataplane/templates/nodeexecutor/webhook.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-union-pod-webhook + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - events + - flyteworkflows + - pods/log + - pods + - rayjobs + - resourcequotas + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + # Allow Access to all resources under flyte.lyft.com + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - nonResourceURLs: + - /metrics + verbs: + - get +--- +# Source: 
dataplane/charts/opencost/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: opencost + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: opencost +subjects: + - kind: ServiceAccount + name: opencost + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: clustersync-resource +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: clustersync-resource +subjects: + - kind: ServiceAccount + name: clustersync-system + namespace: union +--- +# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: clustersync-auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: + - kind: ServiceAccount + name: clustersync-system + namespace: union +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-executor +subjects: +- kind: ServiceAccount + name: executor + namespace: union +--- +# Source: dataplane/templates/nodeexecutor/webhook.yaml +# Create a binding from Role -> ServiceAccount +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-union-pod-webhook + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: union-union-pod-webhook +subjects: + - kind: ServiceAccount + name: union-pod-webhook + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: proxy-system +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - 
create + - update + - delete +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + - deployments + verbs: + - get + - list + - watch + - create + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system-secret +subjects: + - kind: ServiceAccount + name: proxy-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator-system +subjects: + - kind: ServiceAccount + name: operator-system + namespace: union +--- +# Source: dataplane/charts/opencost/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + selector: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + type: "ClusterIP" + ports: + - name: http + port: 9003 + targetPort: 9003 +--- +# Source: dataplane/templates/imagebuilder/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 1234 + targetPort: tcp + protocol: TCP + name: tcp + selector: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/nodeexecutor/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-dataplane-executor + labels: + app: executor +spec: + type: ClusterIP + ports: + - port: 15605 + targetPort: 15605 + protocol: TCP + name: fasttask + selector: + app: executor +--- +# Source: dataplane/templates/nodeexecutor/webhook.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + projectcontour.io/upstream-protocol.h2c: grpc +spec: + selector: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: https + protocol: TCP + port: 443 + targetPort: 9443 + - name: debug + protocol: TCP + port: 10254 + targetPort: 10254 +--- +# 
Source: dataplane/templates/operator/service-proxy.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-proxy + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/operator/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/opencost/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: opencost + namespace: union + labels: + helm.sh/chart: opencost-1.42.0 + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "1.111.0" + app.kubernetes.io/part-of: opencost + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/name: opencost + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: opencost + containers: + - name: opencost + image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 + imagePullPolicy: IfNotPresent + args: + ports: + - containerPort: 9003 + name: http + resources: + limits: + cpu: 1000m + memory: 4Gi + requests: + cpu: 500m + memory: 1Gi + startupProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 30 + livenessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 20 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 9003 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + env: + - name: LOG_LEVEL + value: info + - name: CUSTOM_COST_ENABLED + value: "false" + - name: KUBECOST_NAMESPACE + value: union + - name: API_PORT + value: "9003" + - name: PROMETHEUS_SERVER_ENDPOINT + value: "http://union-operator-prometheus.union.svc:80/prometheus" + - name: CLUSTER_ID + value: "default-cluster" + - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS + value: "15" + - name: CLOUD_COST_ENABLED + value: "false" + - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL + value: "6" + - name: CLOUD_COST_REFRESH_RATE_HOURS + value: "6" + - name: CLOUD_COST_QUERY_WINDOW_DAYS + value: "7" + - name: CLOUD_COST_RUN_WINDOW_DAYS + value: "3" + # Add any additional provided variables +--- +# Source: dataplane/templates/clusterresourcesync/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: syncresources + namespace: union + labels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + 
replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "2a4630966a6ebcfb81c2dc998659a64e8844e1d2ae84016711a90289d0d3e06" + + labels: + + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - clusterresource + - --config + - /etc/flyte/config/*.yaml + - clusterresource + - run + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.2.10" + imagePullPolicy: "IfNotPresent" + name: sync-cluster-resources + resources: + limits: + cpu: "1" + memory: 500Mi + requests: + cpu: 500m + memory: 100Mi + volumeMounts: + - name: auth + mountPath: /etc/union/secret + - name: resource-templates + mountPath: /etc/flyte/clusterresource/templates + - name: config-volume + mountPath: /etc/flyte/config + ports: + - containerPort: 10254 + serviceAccountName: clustersync-system + volumes: + - configMap: + name: clusterresource-template + name: resource-templates + - configMap: + name: flyte-clusterresourcesync-config + name: config-volume + - name: auth + secret: + secretName: union-secret-auth +--- +# Source: dataplane/templates/imagebuilder/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + container.apparmor.security.beta.kubernetes.io/buildkit: unconfined + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + spec: + containers: + - name: "buildkit" + image: "moby/buildkit:buildx-stable-1-rootless" + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - mountPath: 
/home/user/.local/share/buildkit + name: buildkitd + - mountPath: /etc/buildkit + name: buildkit-config + args: + - --config + - /etc/buildkit/buildkitd.toml + - --addr + - unix:///run/user/1000/buildkit/buildkitd.sock + - --addr + - tcp://0.0.0.0:1234 + - --oci-worker-no-process-sandbox + ports: + - name: tcp + containerPort: 1234 + protocol: TCP + readinessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + livenessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + securityContext: + seccompProfile: # Needs Kubernetes >= 1.19 + type: Unconfined + runAsUser: 1000 + runAsGroup: 1000 + resources: + requests: + cpu: 1 + ephemeral-storage: 20Gi + memory: 1Gi + volumes: + - name: buildkitd + emptyDir: {} + - configMap: + name: union-operator-buildkit + name: buildkit-config + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: dataplane/templates/nodeexecutor/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executor + namespace: union + labels: + app: executor +spec: + replicas: 1 + selector: + matchLabels: + app: executor + template: + metadata: + annotations: + configChecksum: "c2db488297a3a7b7dbaaafdb87fd06ab53755f1d8f278b8c1d2c0df9c47d25e" + labels: + + app: executor + spec: + securityContext: + fsGroup: 1337 + serviceAccountName: executor + volumes: + - name: config-volume + configMap: + name: executor + - name: secret-volume + secret: + secretName: union-secret-auth + - name: auth + secret: + secretName: union-secret-auth + containers: + - name: executor + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.2.10" + imagePullPolicy: IfNotPresent + command: + - executor + - serve + - --config + - /etc/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: metrics + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + resources: + limits: + cpu: "4" + memory: "8Gi" + requests: + cpu: "1" + memory: "1Gi" + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: secret-volume + mountPath: /etc/union/secret + - name: auth + mountPath: /etc/secrets/ +--- +# Source: dataplane/templates/nodeexecutor/webhook.yaml +# Create the actual deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: flyte-pod-webhook + 
app.kubernetes.io/instance: release-name + template: + metadata: + labels: + + app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + configChecksum: "c2eae24d5c6dab188e17e69073e32f53362cc05825a11857bc7919316944f59" + + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: + type: spc_t + serviceAccountName: union-pod-webhook + initContainers: + - name: generate-secrets + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.2.10" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - init-certs + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + - name: webhook-certs + mountPath: /etc/webhook/certs + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + containers: + - name: webhook + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.2.10" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + args: + - webhook + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + ports: + - containerPort: 9443 + - containerPort: 10254 + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + readOnly: true + - name: webhook-certs + mountPath: /etc/webhook/certs + readOnly: true + volumes: + - name: config-volume + configMap: + name: flyte-propeller-webhook-config + - name: webhook-certs + emptyDir: {} +--- +# Source: dataplane/templates/operator/deployment-proxy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-proxy + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: 
+ app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "311737cdd59c75ac9e372353fecc4f9640f7a1216c46256b63a68ff3a4dfc35" + + labels: + + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + volumes: + - name: config-volume + projected: + sources: + - configMap: + name: union-operator + - configMap: + name: flyte-clusterresourcesync-config + - name: secret-volume + secret: + secretName: union-secret-auth + serviceAccountName: proxy-system + securityContext: + {} + containers: + - name: operator-proxy + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.2.10" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + args: + - operator + - proxy + - --config + - /etc/union/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: connect + containerPort: 8080 + protocol: TCP + - name: grpc + containerPort: 8081 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + - name: "tunnel" + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.2.10" + imagePullPolicy: IfNotPresent + args: + - cloudflared + - tunnel + - --no-autoupdate + - run + - --token + - $(TUNNEL_TOKEN) + env: + - name: TUNNEL_TOKEN + valueFrom: + secretKeyRef: + name: union-secret-auth + key: tunnel_token + optional: true + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi +--- +# Source: dataplane/templates/operator/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "311737cdd59c75ac9e372353fecc4f9640f7a1216c46256b63a68ff3a4dfc35" + + labels: + + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + serviceAccountName: operator-system + securityContext: + {} + volumes: + - name: config-volume + configMap: + name: union-operator + - name: secret-volume + secret: + secretName: union-secret-auth + containers: + - name: operator + securityContext: + {} + image: 
"public.ecr.aws/p0i0a9q8/unionoperator:2026.2.10" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + args: + - operator + - serve + - --config + - /etc/union/config/*.yaml + - --operator.clusterId.name + - "$(CLUSTER_NAME)" + - --operator.tunnel.k8sSecretName + - union-secret-auth + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: union-opencost-rules + namespace: union + labels: + release: release-name +spec: + groups: + - name: cost_calculations_15s + interval: 15s + rules: + - record: pod_gpu_allocation + expr: | + sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) + - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ) + ) + - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
+ expr: | + max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( + label_replace( + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps + "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup + ), + "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key + ) + ) + - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_workspace_name, label_entity_id)( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces + "label_entity_id", "$1", "label_node_id", "(.*)" # join key + ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels + ) + ) + - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity + # First, calculate the allocated memory for each pod + max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory + ( + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} + ) + ) + or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
+ expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, calculate the allocated cpu for each pod + max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu + ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is 
either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity + # First, calculate the used memory for each pod + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + # Next, add labels to each pod that contain the 
relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:mem_usage_bytes_total_per_node:sum + ) + - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:cpu_usage_per_node:sum + ) + - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity + expr: | + avg by (label_entity_type, label_domain, label_project, label_entity_id) ( + # First, grab the SM occupancy for each pod + max by (namespace, pod) ( + DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) + expr: | + entity_id:sm_occupancy:avg + * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum + - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
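+ # A hedged aside on the unit conversion used throughout these cost rules, with an illustrative price that is not real data: + # if node_cpu_hourly_cost were 0.048 USD per core-hour, then one 15s evaluation sample would contribute + # 0.048 * (15 / 3600) = 0.0002 USD per allocated core, so summing samples over a window (the sum_over_time + # rollups in the cost_rollup_15m group below) approximates the time integral of cost over that window.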
+ expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. + expr: | + label_replace( + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:allocated_mem_cost:sum + or + entity_id:allocated_cpu_cost:sum + or + entity_id:allocated_gpu_cost:sum + ), + "type", "allocated", "", "" # add type info + ) + - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) + expr: | + label_replace( + sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) + # Start with each execution's and app's allocated cost per node + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity + / on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + > 0 # need to avoid dividing by zero, or gaps in the data can 
cause NaNs to proliferate, breaking all charts + ) + # Then multiply by the overhead cost per node + * on (node) group_left() ( + # To calculate overhead, start with the true cost of running each node + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes + * on (node) max by (node) ( + node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an unlabeled series to show up in the Compute Costs dashboard charts + ) * (15 / 3600) # convert hourly cost to 15-secondly cost + # Then subtract out the total allocated cost on each node + - on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + ) + ) + ), + "type", "overhead", "", "" # add type info + ) + - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) + expr: | + label_replace( + sum by (label_domain, label_project, label_entity_id, label_entity_type) ( + entity_id:allocated_cost:sum + or + entity_id:overhead_cost:sum + ), + "type", "total", "", "" # add type info + ) + - record: node:total_cost:sum # Total cost of all nodes + expr: | + sum ( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost + ) + - record: node_type:total_cost:sum # Total cost of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label + ) + - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node, node_type)( # dedupe + label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel + ) + ) * (15 / 3600) # convert to number of hours per 15-second observation + # Aggregate the above into visible metrics + - name: cost_rollup_15m + interval: 15m + rules: + - record: execution_info15m + expr: | + max_over_time(execution_info[15m:15s]) + - record: app_info15m + expr: | + max_over_time(app_info[15m:15s]) + - record: workspace_info15m + expr: | + max_over_time(workspace_info[15m:15s]) + - record: entity_id:allocated_mem_bytes:sum15m + expr: | + 
sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) + - record: entity_id:used_mem_bytes:sum15m + expr: | + sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) + - record: entity_id:allocated_cpu:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) + - record: entity_id:used_cpu:sum15m + expr: | + sum_over_time(entity_id:used_cpu:sum[15m:15s]) + - record: entity_id:weighted_sm_occupancy:sum15m + expr: | + sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) + - record: entity_id:gpu_count:sum15m + expr: | + sum_over_time(entity_id:gpu_count:sum[15m:15s]) + - record: entity_id:allocated_mem_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) + - record: entity_id:allocated_cpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) + - record: entity_id:allocated_gpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) + - record: entity_id:allocated_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cost:sum[15m:15s]) + - record: entity_id:overhead_cost:sum15m + expr: | + sum_over_time(entity_id:overhead_cost:sum[15m:15s]) + - record: entity_id:total_cost:sum15m + expr: | + sum_over_time(entity_id:total_cost:sum[15m:15s]) + - record: node:total_cost:sum15m + expr: | + sum_over_time(node:total_cost:sum[15m:15s]) + - record: node_type:total_cost:sum15m + expr: | + sum_over_time(node_type:total_cost:sum[15m:15s]) + - record: node_type:uptime_hours:sum15m + expr: | + sum_over_time(node_type:uptime_hours:sum[15m:15s]) +--- +# Source: dataplane/templates/monitoring/servicemonitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: cost + namespace: union + labels: + release: release-name +spec: + selector: + matchLabels: + app.kubernetes.io/name: opencost + namespaceSelector: + matchNames: + - "union" + endpoints: + - port: http + interval: 1m + path: /metrics + honorLabels: true + metricRelabelings: + - sourceLabels: [ "__name__" ] + separator: ";" + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep +--- +# Source: dataplane/templates/monitoring/servicemonitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: union-service-monitor + namespace: union + labels: + release: release-name +spec: + selector: + matchLabels: + platform.union.ai/service-group: release-name + namespaceSelector: + matchNames: + - "union" + endpoints: + - port: debug + interval: 1m + path: /metrics + honorLabels: true diff --git a/tests/values/dataplane.baremetal-custom-s3.yaml b/tests/values/dataplane.baremetal-custom-s3.yaml new file mode 100644 index 00000000..46cf637e --- /dev/null +++ b/tests/values/dataplane.baremetal-custom-s3.yaml @@ -0,0 +1,63 @@ +host: acme.eu-west-2.unionai.cloud +clusterName: union-acme +orgName: acme +provider: metal +storage: + provider: custom + bucketName: union + fastRegistrationBucketName: union + custom: + type: stow + container: union + stow: + kind: s3 + config: + region: RNO2A + auth_type: accesskey + access_key_id: dummy-secret-value + secret_key: dummy-secret-value + endpoint: https://s3.example.com + disable_ssl: false + disable_force_path_style: "true" +secrets: + admin: + create: true + clientId: acme-union-acme-operator + clientSecret: dummy-secret-value +fluentbit: + enabled: false +prometheus: + enabled: false +config: + namespace_mapping: + template: "{{`{{ project }}`}}" + namespace_config: + 
namespace_mapping: + template: "{{`{{ project }}`}}" +namespaces: + enabled: false +imageBuilder: + enabled: true + defaultRepository: "https://ghcr.io/acme-corp/acme/union" + authenticationType: "noop" + buildkit: + enabled: true +image: + union: + tag: 2026.2.10 +flytepropeller: + enabled: false +executor: + task_logs: + plugins: + logs: + kubernetes-enabled: false + cloudwatch-enabled: false + templates: + - displayName: "Grafana Logs" + templateUris: + - 'https://grafana-infra.example.internal/explore?schemaVersion=1&panes=%7B%22pane%22%3A%7B%22datasource%22%3A%22af4lu074kfcaoc%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bnamespace%3D%5C%22{{ "{{" }} .namespace {{ "}}" }}%5C%22%2Cpod%3D%5C%22{{ "{{" }} .podName {{ "}}" }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22af4lu074kfcaoc%22%7D%2C%22editorMode%22%3A%22code%22%2C%22direction%22%3A%22backward%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22{{ "{{" }} .podUnixStartTime {{ "}}" }}000%22%2C%22to%22%3A%22now%22%7D%2C%22panelsState%22%3A%7B%22logs%22%3A%7B%22columns%22%3A%7B%220%22%3A%22Line%22%7D%2C%22visualisationType%22%3A%22logs%22%2C%22labelFieldName%22%3A%22labels%22%2C%22refId%22%3A%22A%22%7D%7D%2C%22compact%22%3Afalse%7D%7D&orgId=1' + propeller: + node-config: + disable-input-file-writes: true + persist-cache-status: true
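+# A minimal sketch of how the generated manifest above could be re-rendered from these values; the chart path
+# ./charts/dataplane is an assumption for illustration, and release-name matches the
+# app.kubernetes.io/instance labels seen in the generated output:
+#   helm template release-name ./charts/dataplane --namespace union \
+#     -f tests/values/dataplane.baremetal-custom-s3.yaml \
+#     > tests/generated/dataplane.baremetal-custom-s3.yaml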