From d61f7e636cc974ff05a4182a8512072c5e01e147 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Thu, 26 Feb 2026 08:07:24 -0800 Subject: [PATCH] fix: correct bare-metal custom S3 dataplane values Fixes 6 issues found during deployment review of a bare-metal S3-compatible storage deployment: 1. Remove https:// from imageBuilder.defaultRepository 2. Add storage.region at top level to match stow region (RNO2A) 3. Remove container from storage.custom (eliminates duplicate YAML key) 4. Disable opencost (requires prometheus which is disabled) 5. Clear AWS IAM role annotations (not applicable to bare-metal) 6. Disable monitoring and cost (require prometheus) Generated manifest reduced from 2765 to 1754 lines. Co-Authored-By: Claude Opus 4.6 --- .../dataplane.baremetal-custom-s3.yaml | 1041 +---------------- .../values/dataplane.baremetal-custom-s3.yaml | 12 +- 2 files changed, 25 insertions(+), 1028 deletions(-) diff --git a/tests/generated/dataplane.baremetal-custom-s3.yaml b/tests/generated/dataplane.baremetal-custom-s3.yaml index a6700606..71fd3a78 100644 --- a/tests/generated/dataplane.baremetal-custom-s3.yaml +++ b/tests/generated/dataplane.baremetal-custom-s3.yaml @@ -1,19 +1,4 @@ --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: opencost - namespace: union - labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- # Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount @@ -115,9 +100,9 @@ data: - projectQuotaNvidiaGpu: value: "256" - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' + value: '' - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + value: '' - staging: - projectQuotaCpu: value: "4096" @@ -126,9 +111,9 @@ data: - projectQuotaNvidiaGpu: value: "256" - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' + value: '' - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + value: '' - development: - projectQuotaCpu: value: "4096" @@ -137,9 +122,9 @@ data: - projectQuotaNvidiaGpu: value: "256" - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' + value: '' - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' + value: '' refreshInterval: 5m standaloneDeployment: true templatePath: /etc/flyte/clusterresource/templates @@ -387,7 +372,6 @@ data: start-timeout: 30s storage: container: "union" - container: union stow: config: access_key_id: dummy-secret-value @@ -454,16 +438,15 @@ data: clusterData: appId: 'acme-union-acme-operator' bucketName: 'union' - bucketRegion: 'us-east-1' + bucketRegion: 'RNO2A' cloudHostName: 'acme.eu-west-2.unionai.cloud' gcpProjectId: '' metadataBucketPrefix: 's3://union' - userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - userRoleKey: 'eks.amazonaws.com/role-arn' + userRole: '' + userRoleKey: '' # -- storageType is only used when syncClusterConfig is enabled. It is intentionally disabled and it should not be used. storageType: custom customStorageConfig: | - container: union stow: config: access_key_id: dummy-secret-value @@ -493,7 +476,7 @@ data: proxy: imageBuilderConfig: authenticationType: 'noop' - defaultRepository: 'https://ghcr.io/acme-corp/acme/union' + defaultRepository: 'ghcr.io/acme-corp/acme/union' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -515,7 +498,6 @@ data: storage.yaml: | storage: container: "union" - container: union stow: config: access_key_id: dummy-secret-value @@ -536,7 +518,6 @@ data: fast_registration_storage.yaml: | fastRegistrationStorage: container: "union" - container: union stow: config: access_key_id: dummy-secret-value @@ -549,7 +530,7 @@ data: kind: s3 type: stow image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "https://ghcr.io/acme-corp/acme/union" + image-builder.default-repository: "ghcr.io/acme-corp/acme/union" image-builder.authentication-type: "noop" --- # Source: dataplane/templates/propeller/configmap.yaml @@ -579,93 +560,6 @@ data: serviceName: union-pod-webhook servicePort: '443' --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: opencost - labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: [""] - resources: - - configmaps - - deployments - - nodes - - pods - - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy - resources: - - poddisruptionbudgets - verbs: - - get - - list - - watch - - apiGroups: - - storage.k8s.io - resources: - - storageclasses - verbs: - - get - - list - - watch ---- # Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -836,27 +730,6 @@ rules: verbs: - get --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: opencost - labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: opencost -subjects: - - kind: ServiceAccount - name: opencost - namespace: union ---- # Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -1040,29 +913,6 @@ subjects: name: operator-system namespace: union --- -# Source: dataplane/charts/opencost/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: opencost - namespace: union - labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -spec: - selector: - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 ---- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -1174,100 +1024,6 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: opencost - namespace: union - labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate - template: - metadata: - labels: - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - spec: - serviceAccountName: opencost - containers: - - name: opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent - args: - ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 30 - livenessProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 - readinessProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 10 - failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables ---- # Source: dataplane/templates/clusterresourcesync/deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -1288,7 +1044,7 @@ spec: template: metadata: annotations: - configChecksum: "2a4630966a6ebcfb81c2dc998659a64e8844e1d2ae84016711a90289d0d3e06" + configChecksum: "1dc62f3e23cb7a9ae96048da779d508c803352db29b947635e0a343846fd11e" labels: @@ -1503,7 +1259,7 @@ spec: template: metadata: annotations: - configChecksum: "c2db488297a3a7b7dbaaafdb87fd06ab53755f1d8f278b8c1d2c0df9c47d25e" + configChecksum: "e518d56c53e8fa3869ac11dfbc7fc2b8624fc18fd51d0fdd30a2ab23688b7b4" labels: app: executor @@ -1767,7 +1523,7 @@ spec: template: metadata: annotations: - configChecksum: "311737cdd59c75ac9e372353fecc4f9640f7a1216c46256b63a68ff3a4dfc35" + configChecksum: "13fb60edb107e058cee34471d9ccc99d6a7e9a9e1b257965cd4aa9c11a26a7c" labels: @@ -1904,7 +1660,7 @@ spec: template: metadata: annotations: - configChecksum: "311737cdd59c75ac9e372353fecc4f9640f7a1216c46256b63a68ff3a4dfc35" + configChecksum: "13fb60edb107e058cee34471d9ccc99d6a7e9a9e1b257965cd4aa9c11a26a7c" labels: @@ -1996,770 +1752,3 @@ spec: --- # Source: dataplane/templates/propeller/serviceaccount.yaml --- ---- -# Source: dataplane/templates/monitoring/prometheusrule.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: union-opencost-rules - namespace: union - labels: - release: release-name -spec: - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/monitoring/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: cost - namespace: union - labels: - release: release-name -spec: - selector: - matchLabels: - app.kubernetes.io/name: opencost - namespaceSelector: - matchNames: - - "union" - endpoints: - - port: http - interval: 1m - path: /metrics - honorLabels: true - metricRelabelings: - - sourceLabels: [ "__name__" ] - separator: ";" - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep ---- -# Source: dataplane/templates/monitoring/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: union-service-monitor - namespace: union - labels: - release: release-name -spec: - selector: - matchLabels: - platform.union.ai/service-group: release-name - namespaceSelector: - matchNames: - - "union" - endpoints: - - port: debug - interval: 1m - path: /metrics - honorLabels: true diff --git a/tests/values/dataplane.baremetal-custom-s3.yaml b/tests/values/dataplane.baremetal-custom-s3.yaml index 46cf637e..8d19dc29 100644 --- a/tests/values/dataplane.baremetal-custom-s3.yaml +++ b/tests/values/dataplane.baremetal-custom-s3.yaml @@ -2,13 +2,15 @@ host: acme.eu-west-2.unionai.cloud clusterName: union-acme orgName: acme provider: metal +userRoleAnnotationKey: "" +userRoleAnnotationValue: "" storage: provider: custom bucketName: union fastRegistrationBucketName: union + region: RNO2A custom: type: stow - container: union stow: kind: s3 config: @@ -28,6 +30,12 @@ fluentbit: enabled: false prometheus: enabled: false +opencost: + enabled: false +monitoring: + enabled: false +cost: + enabled: false config: namespace_mapping: template: "{{`{{ project }}`}}" @@ -38,7 +46,7 @@ namespaces: enabled: false imageBuilder: enabled: true - defaultRepository: "https://ghcr.io/acme-corp/acme/union" + defaultRepository: "ghcr.io/acme-corp/acme/union" authenticationType: "noop" buildkit: enabled: true